// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

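/*
 * Hand out the pi_state pre-allocated by refill_pi_state_cache(). The
 * cache is expected to be populated when this is called; the WARN_ON()
 * below fires if a caller skipped the refill.
 */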
static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

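/*
 * Unlink @pi_state from the old owner's pi_state_list and, when a new
 * owner is given, link it to the new owner's list and record the new
 * owner in pi_state->owner. Serialized by pi_mutex.wait_lock and the
 * respective task's pi_lock.
 */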
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

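/*
 * The alleged owner looked up from the user space TID is exiting or
 * already gone. Decide whether the caller should wait for the exit to
 * complete (-EBUSY), retry (-EAGAIN/-EFAULT), or give up with -ESRCH
 * because the exit cleanup did not touch the user space value.
 */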
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *				  futex_lock_pi_atomic()
	 *   exit_signals(tsk)		   No waiters:
	 *    tsk->flags |= PF_EXITING;	   *uaddr == 0x00000PID
	 *  mm_release(tsk)		   Set waiter bit
	 *   exit_robust_list(tsk) {	   *uaddr = 0x80000PID;
	 *      Set owner died		   attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	    tsk = get_task(PID);
	 *   }				    if (!tsk->flags & PF_EXITING) {
	 *   ...			      attach();
	 *   tsk->futex_state =		    } else {
	 *	FUTEX_STATE_DEAD;	      if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *					return -EAGAIN;
	 *				      return -ESRCH; <--- FAIL
	 *				    }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

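/*
 * Allocate a fresh pi_state, initialize its rt_mutex with @p as the
 * proxy owner and link the pi_state into @p's pi_state_list. Called
 * with hb->lock and @p->pi_lock held.
 */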
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

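/*
 * Atomically update the user space futex value from @uval to @newval.
 * Returns 0 on success, -EAGAIN when the value changed underneath us,
 * or the error reported by the cmpxchg helper (e.g. -EFAULT).
 */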
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for. This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if (unlikely((uval & FUTEX_TID_MASK) == vpid))
		return -EDEADLK;

	if (unlikely(should_fail_futex(true)))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourselves to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state. Called with
 * pi_state->pi_mutex.wait_lock held; the lock is dropped before
 * returning.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
	struct rt_mutex_waiter *top_waiter;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!top_waiter)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */
		ret = -EAGAIN;
		goto out_unlock;
	}

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

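/*
 * Fix up pi_state->owner and the user space TID after the rtmutex
 * ownership changed hands. Returns 1 when current ended up owning the
 * lock, 0 when it did not, or a negative error code when the user
 * space value could not be updated. Called with q->lock_ptr and
 * pi_mutex.wait_lock held; both may be dropped and reacquired while
 * handling faults.
 */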
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to the PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault-in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

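/*
 * Wrapper around __fixup_pi_state_owner() which provides the
 * pi_mutex.wait_lock serialization. Must be called with q->lock_ptr
 * held.
 */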
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the futex trylock_pi() operation, with the
 * corresponding semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy_lock(), because then it
	 * will include hb->lock in the blocking chain, even though we'll not
	 * in fact hold it while blocking. This will lead it to report
	 * -EDEADLK and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	spin_lock(q.lock_ptr);
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * first acquire the hb->lock before removing the lock from the
	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
	 * lists consistent.
	 *
	 * In particular; it is important that futex_unlock_pi() can not
	 * observe this inconsistency.
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	hb = futex_hash(&key);
	spin_lock(&hb->lock);

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		get_pi_state(pi_state);
		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and therefore
		 * wake_futex_pi() must observe a state consistent with what we
		 * observed.
		 *
		 * In particular; this forces __rt_mutex_start_proxy_lock() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter. Also see the WARN in wake_futex_pi().
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We preserve neither
	 * the WAITERS bit nor the OWNER_DIED one. We are the owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}
1233
1// SPDX-License-Identifier: GPL-2.0-or-later
2
3#include <linux/slab.h>
4#include <linux/sched/rt.h>
5#include <linux/sched/task.h>
6
7#include "futex.h"
8#include "../locking/rtmutex_common.h"
9
10/*
11 * PI code:
12 */
13int refill_pi_state_cache(void)
14{
15 struct futex_pi_state *pi_state;
16
17 if (likely(current->pi_state_cache))
18 return 0;
19
20 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
21
22 if (!pi_state)
23 return -ENOMEM;
24
25 INIT_LIST_HEAD(&pi_state->list);
26 /* pi_mutex gets initialized later */
27 pi_state->owner = NULL;
28 refcount_set(&pi_state->refcount, 1);
29 pi_state->key = FUTEX_KEY_INIT;
30
31 current->pi_state_cache = pi_state;
32
33 return 0;
34}
35
36static struct futex_pi_state *alloc_pi_state(void)
37{
38 struct futex_pi_state *pi_state = current->pi_state_cache;
39
40 WARN_ON(!pi_state);
41 current->pi_state_cache = NULL;
42
43 return pi_state;
44}
45
46static void pi_state_update_owner(struct futex_pi_state *pi_state,
47 struct task_struct *new_owner)
48{
49 struct task_struct *old_owner = pi_state->owner;
50
51 lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
52
53 if (old_owner) {
54 raw_spin_lock(&old_owner->pi_lock);
55 WARN_ON(list_empty(&pi_state->list));
56 list_del_init(&pi_state->list);
57 raw_spin_unlock(&old_owner->pi_lock);
58 }
59
60 if (new_owner) {
61 raw_spin_lock(&new_owner->pi_lock);
62 WARN_ON(!list_empty(&pi_state->list));
63 list_add(&pi_state->list, &new_owner->pi_state_list);
64 pi_state->owner = new_owner;
65 raw_spin_unlock(&new_owner->pi_lock);
66 }
67}
68
69void get_pi_state(struct futex_pi_state *pi_state)
70{
71 WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
72}
73
74/*
75 * Drops a reference to the pi_state object and frees or caches it
76 * when the last reference is gone.
77 */
78void put_pi_state(struct futex_pi_state *pi_state)
79{
80 if (!pi_state)
81 return;
82
83 if (!refcount_dec_and_test(&pi_state->refcount))
84 return;
85
86 /*
87 * If pi_state->owner is NULL, the owner is most probably dying
88 * and has cleaned up the pi_state already
89 */
90 if (pi_state->owner) {
91 unsigned long flags;
92
93 raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
94 pi_state_update_owner(pi_state, NULL);
95 rt_mutex_proxy_unlock(&pi_state->pi_mutex);
96 raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
97 }
98
99 if (current->pi_state_cache) {
100 kfree(pi_state);
101 } else {
102 /*
103 * pi_state->list is already empty.
104 * clear pi_state->owner.
105 * refcount is at 0 - put it back to 1.
106 */
107 pi_state->owner = NULL;
108 refcount_set(&pi_state->refcount, 1);
109 current->pi_state_cache = pi_state;
110 }
111}
112
113/*
114 * We need to check the following states:
115 *
116 * Waiter | pi_state | pi->owner | uTID | uODIED | ?
117 *
118 * [1] NULL | --- | --- | 0 | 0/1 | Valid
119 * [2] NULL | --- | --- | >0 | 0/1 | Valid
120 *
121 * [3] Found | NULL | -- | Any | 0/1 | Invalid
122 *
123 * [4] Found | Found | NULL | 0 | 1 | Valid
124 * [5] Found | Found | NULL | >0 | 1 | Invalid
125 *
126 * [6] Found | Found | task | 0 | 1 | Valid
127 *
128 * [7] Found | Found | NULL | Any | 0 | Invalid
129 *
130 * [8] Found | Found | task | ==taskTID | 0/1 | Valid
131 * [9] Found | Found | task | 0 | 0 | Invalid
132 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
133 *
134 * [1] Indicates that the kernel can acquire the futex atomically. We
135 * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
136 *
137 * [2] Valid, if TID does not belong to a kernel thread. If no matching
138 * thread is found then it indicates that the owner TID has died.
139 *
140 * [3] Invalid. The waiter is queued on a non PI futex
141 *
142 * [4] Valid state after exit_robust_list(), which sets the user space
143 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
144 *
145 * [5] The user space value got manipulated between exit_robust_list()
146 * and exit_pi_state_list()
147 *
148 * [6] Valid state after exit_pi_state_list() which sets the new owner in
149 * the pi_state but cannot access the user space value.
150 *
151 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
152 *
153 * [8] Owner and user space value match
154 *
155 * [9] There is no transient state which sets the user space TID to 0
156 * except exit_robust_list(), but this is indicated by the
157 * FUTEX_OWNER_DIED bit. See [4]
158 *
159 * [10] There is no transient state which leaves owner and user space
160 * TID out of sync. Except one error case where the kernel is denied
161 * write access to the user address, see fixup_pi_state_owner().
162 *
163 *
164 * Serialization and lifetime rules:
165 *
166 * hb->lock:
167 *
168 * hb -> futex_q, relation
169 * futex_q -> pi_state, relation
170 *
171 * (cannot be raw because hb can contain arbitrary amount
172 * of futex_q's)
173 *
174 * pi_mutex->wait_lock:
175 *
176 * {uval, pi_state}
177 *
178 * (and pi_mutex 'obviously')
179 *
180 * p->pi_lock:
181 *
182 * p->pi_state_list -> pi_state->list, relation
183 * pi_mutex->owner -> pi_state->owner, relation
184 *
185 * pi_state->refcount:
186 *
187 * pi_state lifetime
188 *
189 *
190 * Lock order:
191 *
192 * hb->lock
193 * pi_mutex->wait_lock
194 * p->pi_lock
195 *
196 */
197
198/*
199 * Validate that the existing waiter has a pi_state and sanity check
200 * the pi_state against the user space value. If correct, attach to
201 * it.
202 */
203static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
204 struct futex_pi_state *pi_state,
205 struct futex_pi_state **ps)
206{
207 pid_t pid = uval & FUTEX_TID_MASK;
208 u32 uval2;
209 int ret;
210
211 /*
212 * Userspace might have messed up non-PI and PI futexes [3]
213 */
214 if (unlikely(!pi_state))
215 return -EINVAL;
216
217 /*
218 * We get here with hb->lock held, and having found a
219 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
220 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
221 * which in turn means that futex_lock_pi() still has a reference on
222 * our pi_state.
223 *
224 * The waiter holding a reference on @pi_state also protects against
225 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
226 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
227 * free pi_state before we can take a reference ourselves.
228 */
229 WARN_ON(!refcount_read(&pi_state->refcount));
230
231 /*
232 * Now that we have a pi_state, we can acquire wait_lock
233 * and do the state validation.
234 */
235 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
236
237 /*
238 * Since {uval, pi_state} is serialized by wait_lock, and our current
239 * uval was read without holding it, it can have changed. Verify it
240 * still is what we expect it to be, otherwise retry the entire
241 * operation.
242 */
243 if (futex_get_value_locked(&uval2, uaddr))
244 goto out_efault;
245
246 if (uval != uval2)
247 goto out_eagain;
248
249 /*
250 * Handle the owner died case:
251 */
252 if (uval & FUTEX_OWNER_DIED) {
253 /*
254 * exit_pi_state_list sets owner to NULL and wakes the
255 * topmost waiter. The task which acquires the
256 * pi_state->rt_mutex will fixup owner.
257 */
258 if (!pi_state->owner) {
259 /*
260 * No pi state owner, but the user space TID
261 * is not 0. Inconsistent state. [5]
262 */
263 if (pid)
264 goto out_einval;
265 /*
266 * Take a ref on the state and return success. [4]
267 */
268 goto out_attach;
269 }
270
271 /*
272 * If TID is 0, then either the dying owner has not
273 * yet executed exit_pi_state_list() or some waiter
274 * acquired the rtmutex in the pi state, but did not
275 * yet fixup the TID in user space.
276 *
277 * Take a ref on the state and return success. [6]
278 */
279 if (!pid)
280 goto out_attach;
281 } else {
282 /*
283 * If the owner died bit is not set, then the pi_state
284 * must have an owner. [7]
285 */
286 if (!pi_state->owner)
287 goto out_einval;
288 }
289
290 /*
291 * Bail out if user space manipulated the futex value. If pi
292 * state exists then the owner TID must be the same as the
293 * user space TID. [9/10]
294 */
295 if (pid != task_pid_vnr(pi_state->owner))
296 goto out_einval;
297
298out_attach:
299 get_pi_state(pi_state);
300 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
301 *ps = pi_state;
302 return 0;
303
304out_einval:
305 ret = -EINVAL;
306 goto out_error;
307
308out_eagain:
309 ret = -EAGAIN;
310 goto out_error;
311
312out_efault:
313 ret = -EFAULT;
314 goto out_error;
315
316out_error:
317 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
318 return ret;
319}
320
321static int handle_exit_race(u32 __user *uaddr, u32 uval,
322 struct task_struct *tsk)
323{
324 u32 uval2;
325
326 /*
327 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
328 * caller that the alleged owner is busy.
329 */
330 if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
331 return -EBUSY;
332
333 /*
334 * Reread the user space value to handle the following situation:
335 *
336 * CPU0 CPU1
337 *
338 * sys_exit() sys_futex()
339 * do_exit() futex_lock_pi()
340 * futex_lock_pi_atomic()
341 * exit_signals(tsk) No waiters:
342 * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
343 * mm_release(tsk) Set waiter bit
344 * exit_robust_list(tsk) { *uaddr = 0x80000PID;
345 * Set owner died attach_to_pi_owner() {
346 * *uaddr = 0xC0000000; tsk = get_task(PID);
347 * } if (!tsk->flags & PF_EXITING) {
348 * ... attach();
349 * tsk->futex_state = } else {
350 * FUTEX_STATE_DEAD; if (tsk->futex_state !=
351 * FUTEX_STATE_DEAD)
352 * return -EAGAIN;
353 * return -ESRCH; <--- FAIL
354 * }
355 *
356 * Returning ESRCH unconditionally is wrong here because the
357 * user space value has been changed by the exiting task.
358 *
359 * The same logic applies to the case where the exiting task is
360 * already gone.
361 */
362 if (futex_get_value_locked(&uval2, uaddr))
363 return -EFAULT;
364
365 /* If the user space value has changed, try again. */
366 if (uval2 != uval)
367 return -EAGAIN;
368
369 /*
370 * The exiting task did not have a robust list, the robust list was
371 * corrupted or the user space value in *uaddr is simply bogus.
372 * Give up and tell user space.
373 */
374 return -ESRCH;
375}
376
377static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
378 struct futex_pi_state **ps)
379{
380 /*
381 * No existing pi state. First waiter. [2]
382 *
383 * This creates pi_state, we have hb->lock held, this means nothing can
384 * observe this state, wait_lock is irrelevant.
385 */
386 struct futex_pi_state *pi_state = alloc_pi_state();
387
388 /*
389 * Initialize the pi_mutex in locked state and make @p
390 * the owner of it:
391 */
392 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
393
394 /* Store the key for possible exit cleanups: */
395 pi_state->key = *key;
396
397 WARN_ON(!list_empty(&pi_state->list));
398 list_add(&pi_state->list, &p->pi_state_list);
399 /*
400 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
401 * because there is no concurrency as the object is not published yet.
402 */
403 pi_state->owner = p;
404
405 *ps = pi_state;
406}
407/*
408 * Lookup the task for the TID provided from user space and attach to
409 * it after doing proper sanity checks.
410 */
411static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
412 struct futex_pi_state **ps,
413 struct task_struct **exiting)
414{
415 pid_t pid = uval & FUTEX_TID_MASK;
416 struct task_struct *p;
417
418 /*
419 * We are the first waiter - try to look up the real owner and attach
420 * the new pi_state to it, but bail out when TID = 0 [1]
421 *
422 * The !pid check is paranoid. None of the call sites should end up
423 * with pid == 0, but better safe than sorry. Let the caller retry
424 */
425 if (!pid)
426 return -EAGAIN;
427 p = find_get_task_by_vpid(pid);
428 if (!p)
429 return handle_exit_race(uaddr, uval, NULL);
430
431 if (unlikely(p->flags & PF_KTHREAD)) {
432 put_task_struct(p);
433 return -EPERM;
434 }
435
436 /*
437 * We need to look at the task state to figure out, whether the
438 * task is exiting. To protect against the change of the task state
439 * in futex_exit_release(), we do this protected by p->pi_lock:
440 */
441 raw_spin_lock_irq(&p->pi_lock);
442 if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
443 /*
444 * The task is on the way out. When the futex state is
445 * FUTEX_STATE_DEAD, we know that the task has finished
446 * the cleanup:
447 */
448 int ret = handle_exit_race(uaddr, uval, p);
449
450 raw_spin_unlock_irq(&p->pi_lock);
451 /*
452 * If the owner task is between FUTEX_STATE_EXITING and
453 * FUTEX_STATE_DEAD then store the task pointer and keep
454 * the reference on the task struct. The calling code will
455 * drop all locks, wait for the task to reach
456 * FUTEX_STATE_DEAD and then drop the refcount. This is
457 * required to prevent a live lock when the current task
458 * preempted the exiting task between the two states.
459 */
460 if (ret == -EBUSY)
461 *exiting = p;
462 else
463 put_task_struct(p);
464 return ret;
465 }
466
467 __attach_to_pi_owner(p, key, ps);
468 raw_spin_unlock_irq(&p->pi_lock);
469
470 put_task_struct(p);
471
472 return 0;
473}
474
475static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
476{
477 int err;
478 u32 curval;
479
480 if (unlikely(should_fail_futex(true)))
481 return -EFAULT;
482
483 err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
484 if (unlikely(err))
485 return err;
486
487 /* If user space value changed, let the caller retry */
488 return curval != uval ? -EAGAIN : 0;
489}
490
491/**
492 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
493 * @uaddr: the pi futex user address
494 * @hb: the pi futex hash bucket
495 * @key: the futex key associated with uaddr and hb
496 * @ps: the pi_state pointer where we store the result of the
497 * lookup
498 * @task: the task to perform the atomic lock work for. This will
499 * be "current" except in the case of requeue pi.
500 * @exiting: Pointer to store the task pointer of the owner task
501 * which is in the middle of exiting
502 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
503 *
504 * Return:
505 * - 0 - ready to wait;
506 * - 1 - acquired the lock;
507 * - <0 - error
508 *
509 * The hb->lock must be held by the caller.
510 *
511 * @exiting is only set when the return value is -EBUSY. If so, this holds
512 * a refcount on the exiting task on return and the caller needs to drop it
513 * after waiting for the exit to complete.
514 */
515int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
516 union futex_key *key,
517 struct futex_pi_state **ps,
518 struct task_struct *task,
519 struct task_struct **exiting,
520 int set_waiters)
521{
522 u32 uval, newval, vpid = task_pid_vnr(task);
523 struct futex_q *top_waiter;
524 int ret;
525
526 /*
527 * Read the user space value first so we can validate a few
528 * things before proceeding further.
529 */
530 if (futex_get_value_locked(&uval, uaddr))
531 return -EFAULT;
532
533 if (unlikely(should_fail_futex(true)))
534 return -EFAULT;
535
536 /*
537 * Detect deadlocks.
538 */
539 if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
540 return -EDEADLK;
541
542 if ((unlikely(should_fail_futex(true))))
543 return -EDEADLK;
544
545 /*
546 * Lookup existing state first. If it exists, try to attach to
547 * its pi_state.
548 */
549 top_waiter = futex_top_waiter(hb, key);
550 if (top_waiter)
551 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
552
553 /*
554 * No waiter and user TID is 0. We are here because the
555 * waiters or the owner died bit is set or called from
556 * requeue_cmp_pi or for whatever reason something took the
557 * syscall.
558 */
559 if (!(uval & FUTEX_TID_MASK)) {
560 /*
561 * We take over the futex. No other waiters and the user space
562 * TID is 0. We preserve the owner died bit.
563 */
564 newval = uval & FUTEX_OWNER_DIED;
565 newval |= vpid;
566
567 /* The futex requeue_pi code can enforce the waiters bit */
568 if (set_waiters)
569 newval |= FUTEX_WAITERS;
570
571 ret = lock_pi_update_atomic(uaddr, uval, newval);
572 if (ret)
573 return ret;
574
575 /*
576 * If the waiter bit was requested the caller also needs PI
577 * state attached to the new owner of the user space futex.
578 *
579 * @task is guaranteed to be alive and it cannot be exiting
580 * because it is either sleeping or waiting in
581 * futex_requeue_pi_wakeup_sync().
582 *
583 * No need to do the full attach_to_pi_owner() exercise
584 * because @task is known and valid.
585 */
586 if (set_waiters) {
587 raw_spin_lock_irq(&task->pi_lock);
588 __attach_to_pi_owner(task, key, ps);
589 raw_spin_unlock_irq(&task->pi_lock);
590 }
591 return 1;
592 }
593
594 /*
595 * First waiter. Set the waiters bit before attaching ourself to
596 * the owner. If owner tries to unlock, it will be forced into
597 * the kernel and blocked on hb->lock.
598 */
599 newval = uval | FUTEX_WAITERS;
600 ret = lock_pi_update_atomic(uaddr, uval, newval);
601 if (ret)
602 return ret;
603 /*
604 * If the update of the user space value succeeded, we try to
605 * attach to the owner. If that fails, no harm done, we only
606 * set the FUTEX_WAITERS bit in the user space variable.
607 */
608 return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
609}
610
611/*
612 * Caller must hold a reference on @pi_state.
613 */
614static int wake_futex_pi(u32 __user *uaddr, u32 uval,
615 struct futex_pi_state *pi_state,
616 struct rt_mutex_waiter *top_waiter)
617{
618 struct task_struct *new_owner;
619 bool postunlock = false;
620 DEFINE_RT_WAKE_Q(wqh);
621 u32 curval, newval;
622 int ret = 0;
623
624 new_owner = top_waiter->task;
625
626 /*
627 * We pass it to the next owner. The WAITERS bit is always kept
628 * enabled while there is PI state around. We cleanup the owner
629 * died bit, because we are the owner.
630 */
631 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
632
633 if (unlikely(should_fail_futex(true))) {
634 ret = -EFAULT;
635 goto out_unlock;
636 }
637
638 ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
639 if (!ret && (curval != uval)) {
640 /*
641 * If a unconditional UNLOCK_PI operation (user space did not
642 * try the TID->0 transition) raced with a waiter setting the
643 * FUTEX_WAITERS flag between get_user() and locking the hash
644 * bucket lock, retry the operation.
645 */
646 if ((FUTEX_TID_MASK & curval) == uval)
647 ret = -EAGAIN;
648 else
649 ret = -EINVAL;
650 }
651
652 if (!ret) {
653 /*
654 * This is a point of no return; once we modified the uval
655 * there is no going back and subsequent operations must
656 * not fail.
657 */
658 pi_state_update_owner(pi_state, new_owner);
659 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
660 }
661
662out_unlock:
663 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
664
665 if (postunlock)
666 rt_mutex_postunlock(&wqh);
667
668 return ret;
669}
670
671static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
672 struct task_struct *argowner)
673{
674 struct futex_pi_state *pi_state = q->pi_state;
675 struct task_struct *oldowner, *newowner;
676 u32 uval, curval, newval, newtid;
677 int err = 0;
678
679 oldowner = pi_state->owner;
680
681 /*
682 * We are here because either:
683 *
684 * - we stole the lock and pi_state->owner needs updating to reflect
685 * that (@argowner == current),
686 *
687 * or:
688 *
689 * - someone stole our lock and we need to fix things to point to the
690 * new owner (@argowner == NULL).
691 *
692 * Either way, we have to replace the TID in the user space variable.
693 * This must be atomic as we have to preserve the owner died bit here.
694 *
695 * Note: We write the user space value _before_ changing the pi_state
696 * because we can fault here. Imagine swapped out pages or a fork
697 * that marked all the anonymous memory readonly for cow.
698 *
699 * Modifying pi_state _before_ the user space value would leave the
700 * pi_state in an inconsistent state when we fault here, because we
701 * need to drop the locks to handle the fault. This might be observed
702 * in the PID checks when attaching to PI state .
703 */
704retry:
705 if (!argowner) {
706 if (oldowner != current) {
707 /*
708 * We raced against a concurrent self; things are
709 * already fixed up. Nothing to do.
710 */
711 return 0;
712 }
713
714 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
715 /* We got the lock. pi_state is correct. Tell caller. */
716 return 1;
717 }
718
719 /*
720 * The trylock just failed, so either there is an owner or
721 * there is a higher priority waiter than this one.
722 */
723 newowner = rt_mutex_owner(&pi_state->pi_mutex);
724 /*
725 * If the higher priority waiter has not yet taken over the
726 * rtmutex then newowner is NULL. We can't return here with
727 * that state because it's inconsistent vs. the user space
728 * state. So drop the locks and try again. It's a valid
729 * situation and not any different from the other retry
730 * conditions.
731 */
732 if (unlikely(!newowner)) {
733 err = -EAGAIN;
734 goto handle_err;
735 }
736 } else {
737 WARN_ON_ONCE(argowner != current);
738 if (oldowner == current) {
739 /*
740 * We raced against a concurrent self; things are
741 * already fixed up. Nothing to do.
742 */
743 return 1;
744 }
745 newowner = argowner;
746 }
747
748 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
749 /* Owner died? */
750 if (!pi_state->owner)
751 newtid |= FUTEX_OWNER_DIED;
752
753 err = futex_get_value_locked(&uval, uaddr);
754 if (err)
755 goto handle_err;
756
757 for (;;) {
758 newval = (uval & FUTEX_OWNER_DIED) | newtid;
759
760 err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
761 if (err)
762 goto handle_err;
763
764 if (curval == uval)
765 break;
766 uval = curval;
767 }
768
769 /*
770 * We fixed up user space. Now we need to fix the pi_state
771 * itself.
772 */
773 pi_state_update_owner(pi_state, newowner);
774
775 return argowner == current;
776
777 /*
778 * In order to reschedule or handle a page fault, we need to drop the
779 * locks here. In the case of a fault, this gives the other task
780 * (either the highest priority waiter itself or the task which stole
781 * the rtmutex) the chance to try the fixup of the pi_state. So once we
782 * are back from handling the fault we need to check the pi_state after
783 * reacquiring the locks and before trying to do another fixup. If
784 * the fixup has already been done, we simply return.
785 *
786 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
787 * drop hb->lock since the caller owns the hb -> futex_q relation.
788 * Dropping the pi_mutex->wait_lock requires revalidating the state.
789 */
790handle_err:
791 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
792 spin_unlock(q->lock_ptr);
793
794 switch (err) {
795 case -EFAULT:
796 err = fault_in_user_writeable(uaddr);
797 break;
798
799 case -EAGAIN:
800 cond_resched();
801 err = 0;
802 break;
803
804 default:
805 WARN_ON_ONCE(1);
806 break;
807 }
808
809 spin_lock(q->lock_ptr);
810 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
811
812 /*
813 * Check if someone else fixed it for us:
814 */
815 if (pi_state->owner != oldowner)
816 return argowner == current;
817
818 /* Retry if err was -EAGAIN or the fault-in succeeded */
819 if (!err)
820 goto retry;
821
822 /*
823 * fault_in_user_writeable() failed so user state is immutable. At
824 * best we can make the kernel state consistent, but the user state will
825 * most likely be hosed and any subsequent unlock operation will be
826 * rejected due to PI futex rule [10].
827 *
828 * Ensure that the rtmutex owner is also the pi_state owner despite
829 * the user space value claiming something different. There is no
830 * point in unlocking the rtmutex if current is the owner as it
831 * would need to wait until the next waiter has taken the rtmutex
832 * to guarantee consistent state. Keep it simple. Userspace asked
833 * for this wrecked state.
834 *
835 * The rtmutex has an owner - either current or some other
836 * task. See the EAGAIN loop above.
837 */
838 pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
839
840 return err;
841}
842
843static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
844 struct task_struct *argowner)
845{
846 struct futex_pi_state *pi_state = q->pi_state;
847 int ret;
848
849 lockdep_assert_held(q->lock_ptr);
850
851 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
852 ret = __fixup_pi_state_owner(uaddr, q, argowner);
853 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
854 return ret;
855}
856
857/**
858 * fixup_pi_owner() - Post lock pi_state and corner case management
859 * @uaddr: user address of the futex
860 * @q: futex_q (contains pi_state and access to the rt_mutex)
861 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
862 *
863 * After attempting to lock an rt_mutex, this function is called to clean up
864 * the pi_state owner as well as handle race conditions that may allow us to
865 * acquire the lock. Must be called with the hb lock held.
866 *
867 * Return:
868 * - 1 - success, lock taken;
869 * - 0 - success, lock not taken;
870 * - <0 - on error (-EFAULT)
871 */
872int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
873{
874 if (locked) {
875 /*
876 * Got the lock. We might not be the anticipated owner if we
877 * did a lock-steal - fix up the PI-state in that case:
878 *
879 * Speculative pi_state->owner read (we don't hold wait_lock);
880 * since we own the lock, pi_state->owner == current is the
881 * stable state; anything else needs more attention.
882 */
883 if (q->pi_state->owner != current)
884 return fixup_pi_state_owner(uaddr, q, current);
885 return 1;
886 }
887
888 /*
889 * If we didn't get the lock, check if anybody stole it from us. In
890 * that case, we need to fix up the uval to point to them instead of
891 * us, otherwise bad things happen. [10]
892 *
893 * Another speculative read; pi_state->owner == current is unstable
894 * but needs our attention.
895 */
896 if (q->pi_state->owner == current)
897 return fixup_pi_state_owner(uaddr, q, NULL);
898
899 /*
900 * Paranoia check. If we did not take the lock, then we should not be
901 * the owner of the rt_mutex. Warn and establish consistent state.
902 */
903 if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
904 return fixup_pi_state_owner(uaddr, q, current);
905
906 return 0;
907}
908
909/*
910 * Userspace tried a 0 -> TID atomic transition of the futex value
911 * and failed. The kernel side here does the whole locking operation:
912 * if there are waiters then it will block as a consequence of relying
913 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
914 * a 0 value of the futex too).
915 *
916 * Also implements the FUTEX_TRYLOCK_PI operation, with the corresponding semantics.
917 */
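/*
 * Illustrative userspace fast path that precedes this slowpath (a sketch,
 * not kernel code; futex_word, tid and the futex_syscall() wrapper are
 * hypothetical). Only when the 0 -> TID cmpxchg fails does the thread
 * enter the kernel and reach futex_lock_pi():
 *
 *	if (!__sync_bool_compare_and_swap(futex_word, 0, tid))
 *		futex_syscall(futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 */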
918int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
919{
920 struct hrtimer_sleeper timeout, *to;
921 struct task_struct *exiting = NULL;
922 struct rt_mutex_waiter rt_waiter;
923 struct futex_hash_bucket *hb;
924 struct futex_q q = futex_q_init;
925 int res, ret;
926
927 if (!IS_ENABLED(CONFIG_FUTEX_PI))
928 return -ENOSYS;
929
930 if (refill_pi_state_cache())
931 return -ENOMEM;
932
933 to = futex_setup_timer(time, &timeout, flags, 0);
934
935retry:
936 ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
937 if (unlikely(ret != 0))
938 goto out;
939
940retry_private:
941 hb = futex_q_lock(&q);
942
943 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
944 &exiting, 0);
945 if (unlikely(ret)) {
946 /*
947 * Atomic work succeeded and we got the lock,
948 * or failed. Either way, we do _not_ block.
949 */
950 switch (ret) {
951 case 1:
952 /* We got the lock. */
953 ret = 0;
954 goto out_unlock_put_key;
955 case -EFAULT:
956 goto uaddr_faulted;
957 case -EBUSY:
958 case -EAGAIN:
959 /*
960 * Two reasons for this:
961 * - EBUSY: Task is exiting and we just wait for the
962 * exit to complete.
963 * - EAGAIN: The user space value changed.
964 */
965 futex_q_unlock(hb);
966 /*
967 * Handle the case where the owner is in the middle of
968 * exiting. Wait for the exit to complete otherwise
969 * this task might loop forever, aka. live lock.
970 */
971 wait_for_owner_exiting(ret, exiting);
972 cond_resched();
973 goto retry;
974 default:
975 goto out_unlock_put_key;
976 }
977 }
978
979 WARN_ON(!q.pi_state);
980
981 /*
982 * Only actually queue now that the atomic ops are done:
983 */
984 __futex_queue(&q, hb);
985
986 if (trylock) {
987 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
988 /* Fixup the trylock return value: */
989 ret = ret ? 0 : -EWOULDBLOCK;
990 goto no_block;
991 }
992
993 /*
994 * Must be done before we enqueue the waiter. Here it is unfortunately
995 * done under the hb lock, but that *should* work because it does nothing.
996 */
997 rt_mutex_pre_schedule();
998
999 rt_mutex_init_waiter(&rt_waiter);
1000
1001 /*
1002 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
1003 * hold it while doing __rt_mutex_start_proxy_lock(), because then it will
1004 * include hb->lock in the blocking chain, even though we will not in
1005 * fact hold it while blocking. This will lead it to report -EDEADLK
1006 * and BUG when futex_unlock_pi() interleaves with this.
1007 *
1008 * Therefore acquire wait_lock while holding hb->lock, but drop the
1009 * latter before calling __rt_mutex_start_proxy_lock(). This
1010 * interleaves with futex_unlock_pi() -- which does a similar lock
1011 * handoff -- such that the latter can observe the futex_q::pi_state
1012 * before __rt_mutex_start_proxy_lock() is done.
1013 */
1014 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
1015 spin_unlock(q.lock_ptr);
1016 /*
1017 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
1018 * such that futex_unlock_pi() is guaranteed to observe the waiter when
1019 * it sees the futex_q::pi_state.
1020 */
1021 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
1022 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
1023
1024 if (ret) {
1025 if (ret == 1)
1026 ret = 0;
1027 goto cleanup;
1028 }
1029
1030 if (unlikely(to))
1031 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
1032
1033 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
1034
1035cleanup:
1036 /*
1037 * If we failed to acquire the lock (deadlock/signal/timeout), we must
1038 * unwind the above; however, we cannot lock hb->lock because
1039 * rt_mutex already has a waiter enqueued and hb->lock can itself try
1040 * and enqueue an rt_waiter through rtlock.
1041 *
1042 * Doing the cleanup without holding hb->lock can cause inconsistent
1043 * state between hb and pi_state, but only in the direction of not
1044 * seeing a waiter that is leaving.
1045 *
1046 * See futex_unlock_pi(), it deals with this inconsistency.
1047 *
1048 * There be dragons here: since we must deal with the inconsistency on
1049 * the way out (here), it is impossible to detect/warn about the race
1050 * the other way around (missing an incoming waiter).
1051 *
1052 * What could possibly go wrong...
1053 */
1054 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
1055 ret = 0;
1056
1057 /*
1058 * Now that the rt_waiter has been dequeued, it is safe to use
1059 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
1060 * the pi_state owner.
1061 */
1062 spin_lock(q.lock_ptr);
1063 /*
1064 * Waiter is unqueued.
1065 */
1066 rt_mutex_post_schedule();
1067no_block:
1068 /*
1069 * Fixup the pi_state owner and possibly acquire the lock if we
1070 * haven't already.
1071 */
1072 res = fixup_pi_owner(uaddr, &q, !ret);
1073 /*
1074 * If fixup_pi_owner() returned an error, propagate that. If it acquired
1075 * the lock, clear our -ETIMEDOUT or -EINTR.
1076 */
1077 if (res)
1078 ret = (res < 0) ? res : 0;
1079
1080 futex_unqueue_pi(&q);
1081 spin_unlock(q.lock_ptr);
1082 goto out;
1083
1084out_unlock_put_key:
1085 futex_q_unlock(hb);
1086
1087out:
1088 if (to) {
1089 hrtimer_cancel(&to->timer);
1090 destroy_hrtimer_on_stack(&to->timer);
1091 }
1092 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1093
1094uaddr_faulted:
1095 futex_q_unlock(hb);
1096
1097 ret = fault_in_user_writeable(uaddr);
1098 if (ret)
1099 goto out;
1100
1101 if (!(flags & FLAGS_SHARED))
1102 goto retry_private;
1103
1104 goto retry;
1105}
1106
1107/*
1108 * Userspace attempted a TID -> 0 atomic transition, and failed.
1109 * This is the in-kernel slowpath: we look up the PI state (if any),
1110 * and do the rt-mutex unlock.
1111 */
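/*
 * Illustrative userspace fast path for the unlock side (a sketch, not
 * kernel code; futex_word, tid and futex_syscall() are hypothetical).
 * Only when the TID -> 0 cmpxchg fails, typically because FUTEX_WAITERS
 * is set, does the owner enter the kernel and reach futex_unlock_pi():
 *
 *	if (!__sync_bool_compare_and_swap(futex_word, tid, 0))
 *		futex_syscall(futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */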
1112int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
1113{
1114 u32 curval, uval, vpid = task_pid_vnr(current);
1115 union futex_key key = FUTEX_KEY_INIT;
1116 struct futex_hash_bucket *hb;
1117 struct futex_q *top_waiter;
1118 int ret;
1119
1120 if (!IS_ENABLED(CONFIG_FUTEX_PI))
1121 return -ENOSYS;
1122
1123retry:
1124 if (get_user(uval, uaddr))
1125 return -EFAULT;
1126 /*
1127 * We release only a lock we actually own:
1128 */
1129 if ((uval & FUTEX_TID_MASK) != vpid)
1130 return -EPERM;
1131
1132 ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
1133 if (ret)
1134 return ret;
1135
1136 hb = futex_hash(&key);
1137 spin_lock(&hb->lock);
1138retry_hb:
1139
1140 /*
1141 * Check waiters first. We do not trust user space values at
1142 * all and we at least want to know if user space fiddled
1143 * with the futex value instead of blindly unlocking.
1144 */
1145 top_waiter = futex_top_waiter(hb, &key);
1146 if (top_waiter) {
1147 struct futex_pi_state *pi_state = top_waiter->pi_state;
1148 struct rt_mutex_waiter *rt_waiter;
1149
1150 ret = -EINVAL;
1151 if (!pi_state)
1152 goto out_unlock;
1153
1154 /*
1155 * If current does not own the pi_state then the futex is
1156 * inconsistent and user space fiddled with the futex value.
1157 */
1158 if (pi_state->owner != current)
1159 goto out_unlock;
1160
1161 /*
1162 * By taking wait_lock while still holding hb->lock, we ensure
1163 * there is no point where we hold neither; and thereby
1164 * wake_futex_pi() must observe any new waiters.
1165 *
1166 * Since the cleanup: case in futex_lock_pi() removes the
1167 * rt_waiter without holding hb->lock, it is possible for
1168 * wake_futex_pi() to not find a waiter while the above does;
1169 * in this case the waiter is on the way out and it can be
1170 * ignored.
1171 *
1172 * In particular, this forces __rt_mutex_start_proxy_lock() to
1173 * complete such that we're guaranteed to observe the
1174 * rt_waiter.
1175 */
1176 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1177
1178 /*
1179 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
1180 * waiters even though futex thinks there are, then the waiter
1181 * is leaving. The entry needs to be removed from the list so a
1182 * new futex_lock_pi() is not using this stale PI-state while
1183 * the futex is available in user space again.
1184 * There can be more than one task on its way out, so we need
1185 * to retry.
1186 */
1187 rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
1188 if (!rt_waiter) {
1189 __futex_unqueue(top_waiter);
1190 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1191 goto retry_hb;
1192 }
1193
1194 get_pi_state(pi_state);
1195 spin_unlock(&hb->lock);
1196
1197 /* drops pi_state->pi_mutex.wait_lock */
1198 ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
1199
1200 put_pi_state(pi_state);
1201
1202 /*
1203 * Success, we're done! No tricky corner cases.
1204 */
1205 if (!ret)
1206 return ret;
1207 /*
1208 * The atomic access to the futex value generated a
1209 * pagefault, so retry the user-access and the wakeup:
1210 */
1211 if (ret == -EFAULT)
1212 goto pi_faulted;
1213 /*
1214 * An unconditional UNLOCK_PI op raced against a waiter
1215 * setting the FUTEX_WAITERS bit. Try again.
1216 */
1217 if (ret == -EAGAIN)
1218 goto pi_retry;
1219 /*
1220 * wake_futex_pi has detected invalid state. Tell user
1221 * space.
1222 */
1223 return ret;
1224 }
1225
1226 /*
1227 * We have no kernel internal state, i.e. no waiters in the
1228 * kernel. Waiters which are about to queue themselves are stuck
1229 * on hb->lock. So we can safely ignore them. We preserve
1230 * neither the WAITERS bit nor the OWNER_DIED one. We are the
1231 * owner.
1232 */
1233 if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
1234 spin_unlock(&hb->lock);
1235 switch (ret) {
1236 case -EFAULT:
1237 goto pi_faulted;
1238
1239 case -EAGAIN:
1240 goto pi_retry;
1241
1242 default:
1243 WARN_ON_ONCE(1);
1244 return ret;
1245 }
1246 }
1247
1248 /*
1249 * If uval has changed, let user space handle it.
1250 */
1251 ret = (curval == uval) ? 0 : -EAGAIN;
1252
1253out_unlock:
1254 spin_unlock(&hb->lock);
1255 return ret;
1256
1257pi_retry:
1258 cond_resched();
1259 goto retry;
1260
1261pi_faulted:
1262
1263 ret = fault_in_user_writeable(uaddr);
1264 if (!ret)
1265 goto retry;
1266
1267 return ret;
1268}
1269