// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

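/*
 * Hand out the pi_state pre-allocated by refill_pi_state_cache(). The
 * cache is expected to be populated when this is called; the WARN_ON()
 * below fires if a caller skipped the refill.
 */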
static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

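/*
 * Unlink @pi_state from the old owner's pi_state_list and, when a new
 * owner is given, link it to the new owner's list and record the new
 * owner in pi_state->owner. Serialized by pi_mutex.wait_lock and the
 * respective task's pi_lock.
 */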
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

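/*
 * The alleged owner looked up from the user space TID is exiting or
 * already gone. Decide whether the caller should wait for the exit to
 * complete (-EBUSY), retry (-EAGAIN/-EFAULT), or give up with -ESRCH
 * because the exit cleanup did not touch the user space value.
 */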
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *				  futex_lock_pi_atomic()
	 *   exit_signals(tsk)		   No waiters:
	 *    tsk->flags |= PF_EXITING;	   *uaddr == 0x00000PID
	 *  mm_release(tsk)		   Set waiter bit
	 *   exit_robust_list(tsk) {	   *uaddr = 0x80000PID;
	 *      Set owner died		   attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	    tsk = get_task(PID);
	 *   }				    if (!tsk->flags & PF_EXITING) {
	 *   ...			      attach();
	 *   tsk->futex_state =		    } else {
	 *	FUTEX_STATE_DEAD;	      if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *					return -EAGAIN;
	 *				      return -ESRCH; <--- FAIL
	 *				    }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

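/*
 * Allocate a fresh pi_state, initialize its rt_mutex with @p as the
 * proxy owner and link the pi_state into @p's pi_state_list. Called
 * with hb->lock and @p->pi_lock held.
 */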
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

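/*
 * Atomically update the user space futex value from @uval to @newval.
 * Returns 0 on success, -EAGAIN when the value changed underneath us,
 * or the error reported by the cmpxchg helper (e.g. -EFAULT).
 */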
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for. This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if (unlikely((uval & FUTEX_TID_MASK) == vpid))
		return -EDEADLK;

	if (unlikely(should_fail_futex(true)))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourselves to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state. Called with
 * pi_state->pi_mutex.wait_lock held; the lock is dropped before
 * returning.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
	struct rt_mutex_waiter *top_waiter;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!top_waiter)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */
		ret = -EAGAIN;
		goto out_unlock;
	}

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

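/*
 * Fix up pi_state->owner and the user space TID after the rtmutex
 * ownership changed hands. Returns 1 when current ended up owning the
 * lock, 0 when it did not, or a negative error code when the user
 * space value could not be updated. Called with q->lock_ptr and
 * pi_mutex.wait_lock held; both may be dropped and reacquired while
 * handling faults.
 */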
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to the PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault-in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

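/*
 * Wrapper around __fixup_pi_state_owner() which provides the
 * pi_mutex.wait_lock serialization. Must be called with q->lock_ptr
 * held.
 */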
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the futex trylock_pi() operation, with the
 * corresponding semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy_lock(), because then it
	 * will include hb->lock in the blocking chain, even though we'll not
	 * in fact hold it while blocking. This will lead it to report
	 * -EDEADLK and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	spin_lock(q.lock_ptr);
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * first acquire the hb->lock before removing the lock from the
	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
	 * lists consistent.
	 *
	 * In particular; it is important that futex_unlock_pi() can not
	 * observe this inconsistency.
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	hb = futex_hash(&key);
	spin_lock(&hb->lock);

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		get_pi_state(pi_state);
		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and therefore
		 * wake_futex_pi() must observe a state consistent with what we
		 * observed.
		 *
		 * In particular; this forces __rt_mutex_start_proxy_lock() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter. Also see the WARN in wake_futex_pi().
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We preserve neither
	 * the WAITERS bit nor the OWNER_DIED one. We are the owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}
1233
1// SPDX-License-Identifier: GPL-2.0-or-later
2
3#include <linux/slab.h>
4#include <linux/sched/rt.h>
5#include <linux/sched/task.h>
6
7#include "futex.h"
8#include "../locking/rtmutex_common.h"
9
10/*
11 * PI code:
12 */
13int refill_pi_state_cache(void)
14{
15 struct futex_pi_state *pi_state;
16
17 if (likely(current->pi_state_cache))
18 return 0;
19
20 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
21
22 if (!pi_state)
23 return -ENOMEM;
24
25 INIT_LIST_HEAD(&pi_state->list);
26 /* pi_mutex gets initialized later */
27 pi_state->owner = NULL;
28 refcount_set(&pi_state->refcount, 1);
29 pi_state->key = FUTEX_KEY_INIT;
30
31 current->pi_state_cache = pi_state;
32
33 return 0;
34}
35
36static struct futex_pi_state *alloc_pi_state(void)
37{
38 struct futex_pi_state *pi_state = current->pi_state_cache;
39
40 WARN_ON(!pi_state);
41 current->pi_state_cache = NULL;
42
43 return pi_state;
44}
45
46static void pi_state_update_owner(struct futex_pi_state *pi_state,
47 struct task_struct *new_owner)
48{
49 struct task_struct *old_owner = pi_state->owner;
50
51 lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
52
53 if (old_owner) {
54 raw_spin_lock(&old_owner->pi_lock);
55 WARN_ON(list_empty(&pi_state->list));
56 list_del_init(&pi_state->list);
57 raw_spin_unlock(&old_owner->pi_lock);
58 }
59
60 if (new_owner) {
61 raw_spin_lock(&new_owner->pi_lock);
62 WARN_ON(!list_empty(&pi_state->list));
63 list_add(&pi_state->list, &new_owner->pi_state_list);
64 pi_state->owner = new_owner;
65 raw_spin_unlock(&new_owner->pi_lock);
66 }
67}
68
69void get_pi_state(struct futex_pi_state *pi_state)
70{
71 WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
72}
73
74/*
75 * Drops a reference to the pi_state object and frees or caches it
76 * when the last reference is gone.
77 */
78void put_pi_state(struct futex_pi_state *pi_state)
79{
80 if (!pi_state)
81 return;
82
83 if (!refcount_dec_and_test(&pi_state->refcount))
84 return;
85
86 /*
87 * If pi_state->owner is NULL, the owner is most probably dying
88 * and has cleaned up the pi_state already
89 */
90 if (pi_state->owner) {
91 unsigned long flags;
92
93 raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
94 pi_state_update_owner(pi_state, NULL);
95 rt_mutex_proxy_unlock(&pi_state->pi_mutex);
96 raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
97 }
98
99 if (current->pi_state_cache) {
100 kfree(pi_state);
101 } else {
102 /*
103 * pi_state->list is already empty.
104 * clear pi_state->owner.
105 * refcount is at 0 - put it back to 1.
106 */
107 pi_state->owner = NULL;
108 refcount_set(&pi_state->refcount, 1);
109 current->pi_state_cache = pi_state;
110 }
111}
112
113/*
114 * We need to check the following states:
115 *
116 * Waiter | pi_state | pi->owner | uTID | uODIED | ?
117 *
118 * [1] NULL | --- | --- | 0 | 0/1 | Valid
119 * [2] NULL | --- | --- | >0 | 0/1 | Valid
120 *
121 * [3] Found | NULL | -- | Any | 0/1 | Invalid
122 *
123 * [4] Found | Found | NULL | 0 | 1 | Valid
124 * [5] Found | Found | NULL | >0 | 1 | Invalid
125 *
126 * [6] Found | Found | task | 0 | 1 | Valid
127 *
128 * [7] Found | Found | NULL | Any | 0 | Invalid
129 *
130 * [8] Found | Found | task | ==taskTID | 0/1 | Valid
131 * [9] Found | Found | task | 0 | 0 | Invalid
132 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
133 *
134 * [1] Indicates that the kernel can acquire the futex atomically. We
135 * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
136 *
137 * [2] Valid, if TID does not belong to a kernel thread. If no matching
138 * thread is found then it indicates that the owner TID has died.
139 *
140 * [3] Invalid. The waiter is queued on a non PI futex
141 *
142 * [4] Valid state after exit_robust_list(), which sets the user space
143 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
144 *
145 * [5] The user space value got manipulated between exit_robust_list()
146 * and exit_pi_state_list()
147 *
148 * [6] Valid state after exit_pi_state_list() which sets the new owner in
149 * the pi_state but cannot access the user space value.
150 *
151 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
152 *
153 * [8] Owner and user space value match
154 *
155 * [9] There is no transient state which sets the user space TID to 0
156 * except exit_robust_list(), but this is indicated by the
157 * FUTEX_OWNER_DIED bit. See [4]
158 *
159 * [10] There is no transient state which leaves owner and user space
160 * TID out of sync. Except one error case where the kernel is denied
161 * write access to the user address, see fixup_pi_state_owner().
162 *
163 *
164 * Serialization and lifetime rules:
165 *
166 * hb->lock:
167 *
168 * hb -> futex_q, relation
169 * futex_q -> pi_state, relation
170 *
171 * (cannot be raw because hb can contain arbitrary amount
172 * of futex_q's)
173 *
174 * pi_mutex->wait_lock:
175 *
176 * {uval, pi_state}
177 *
178 * (and pi_mutex 'obviously')
179 *
180 * p->pi_lock:
181 *
182 * p->pi_state_list -> pi_state->list, relation
183 * pi_mutex->owner -> pi_state->owner, relation
184 *
185 * pi_state->refcount:
186 *
187 * pi_state lifetime
188 *
189 *
190 * Lock order:
191 *
192 * hb->lock
193 * pi_mutex->wait_lock
194 * p->pi_lock
195 *
196 */
197
198/*
199 * Validate that the existing waiter has a pi_state and sanity check
200 * the pi_state against the user space value. If correct, attach to
201 * it.
202 */
203static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
204 struct futex_pi_state *pi_state,
205 struct futex_pi_state **ps)
206{
207 pid_t pid = uval & FUTEX_TID_MASK;
208 u32 uval2;
209 int ret;
210
211 /*
212 * Userspace might have messed up non-PI and PI futexes [3]
213 */
214 if (unlikely(!pi_state))
215 return -EINVAL;
216
217 /*
218 * We get here with hb->lock held, and having found a
219 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
220 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
221 * which in turn means that futex_lock_pi() still has a reference on
222 * our pi_state.
223 *
224 * The waiter holding a reference on @pi_state also protects against
225 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
226 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
227 * free pi_state before we can take a reference ourselves.
228 */
229 WARN_ON(!refcount_read(&pi_state->refcount));
230
231 /*
232 * Now that we have a pi_state, we can acquire wait_lock
233 * and do the state validation.
234 */
235 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
236
237 /*
238 * Since {uval, pi_state} is serialized by wait_lock, and our current
239 * uval was read without holding it, it can have changed. Verify it
240 * still is what we expect it to be, otherwise retry the entire
241 * operation.
242 */
243 if (futex_get_value_locked(&uval2, uaddr))
244 goto out_efault;
245
246 if (uval != uval2)
247 goto out_eagain;
248
249 /*
250 * Handle the owner died case:
251 */
252 if (uval & FUTEX_OWNER_DIED) {
253 /*
254 * exit_pi_state_list sets owner to NULL and wakes the
255 * topmost waiter. The task which acquires the
256 * pi_state->rt_mutex will fixup owner.
257 */
258 if (!pi_state->owner) {
259 /*
260 * No pi state owner, but the user space TID
261 * is not 0. Inconsistent state. [5]
262 */
263 if (pid)
264 goto out_einval;
265 /*
266 * Take a ref on the state and return success. [4]
267 */
268 goto out_attach;
269 }
270
271 /*
272 * If TID is 0, then either the dying owner has not
273 * yet executed exit_pi_state_list() or some waiter
274 * acquired the rtmutex in the pi state, but did not
275 * yet fixup the TID in user space.
276 *
277 * Take a ref on the state and return success. [6]
278 */
279 if (!pid)
280 goto out_attach;
281 } else {
282 /*
283 * If the owner died bit is not set, then the pi_state
284 * must have an owner. [7]
285 */
286 if (!pi_state->owner)
287 goto out_einval;
288 }
289
290 /*
291 * Bail out if user space manipulated the futex value. If pi
292 * state exists then the owner TID must be the same as the
293 * user space TID. [9/10]
294 */
295 if (pid != task_pid_vnr(pi_state->owner))
296 goto out_einval;
297
298out_attach:
299 get_pi_state(pi_state);
300 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
301 *ps = pi_state;
302 return 0;
303
304out_einval:
305 ret = -EINVAL;
306 goto out_error;
307
308out_eagain:
309 ret = -EAGAIN;
310 goto out_error;
311
312out_efault:
313 ret = -EFAULT;
314 goto out_error;
315
316out_error:
317 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
318 return ret;
319}
320
321static int handle_exit_race(u32 __user *uaddr, u32 uval,
322 struct task_struct *tsk)
323{
324 u32 uval2;
325
326 /*
327 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
328 * caller that the alleged owner is busy.
329 */
330 if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
331 return -EBUSY;
332
333 /*
334 * Reread the user space value to handle the following situation:
335 *
336 * CPU0 CPU1
337 *
338 * sys_exit() sys_futex()
339 * do_exit() futex_lock_pi()
340 * futex_lock_pi_atomic()
341 * exit_signals(tsk) No waiters:
342 * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
343 * mm_release(tsk) Set waiter bit
344 * exit_robust_list(tsk) { *uaddr = 0x80000PID;
345 * Set owner died attach_to_pi_owner() {
346 * *uaddr = 0xC0000000; tsk = get_task(PID);
347 * } if (!tsk->flags & PF_EXITING) {
348 * ... attach();
349 * tsk->futex_state = } else {
350 * FUTEX_STATE_DEAD; if (tsk->futex_state !=
351 * FUTEX_STATE_DEAD)
352 * return -EAGAIN;
353 * return -ESRCH; <--- FAIL
354 * }
355 *
356 * Returning ESRCH unconditionally is wrong here because the
357 * user space value has been changed by the exiting task.
358 *
359 * The same logic applies to the case where the exiting task is
360 * already gone.
361 */
362 if (futex_get_value_locked(&uval2, uaddr))
363 return -EFAULT;
364
365 /* If the user space value has changed, try again. */
366 if (uval2 != uval)
367 return -EAGAIN;
368
369 /*
370 * The exiting task did not have a robust list, the robust list was
371 * corrupted or the user space value in *uaddr is simply bogus.
372 * Give up and tell user space.
373 */
374 return -ESRCH;
375}
376
377static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
378 struct futex_pi_state **ps)
379{
380 /*
381 * No existing pi state. First waiter. [2]
382 *
383 * This creates pi_state, we have hb->lock held, this means nothing can
384 * observe this state, wait_lock is irrelevant.
385 */
386 struct futex_pi_state *pi_state = alloc_pi_state();
387
388 /*
389 * Initialize the pi_mutex in locked state and make @p
390 * the owner of it:
391 */
392 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
393
394 /* Store the key for possible exit cleanups: */
395 pi_state->key = *key;
396
397 WARN_ON(!list_empty(&pi_state->list));
398 list_add(&pi_state->list, &p->pi_state_list);
399 /*
400 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
401 * because there is no concurrency as the object is not published yet.
402 */
403 pi_state->owner = p;
404
405 *ps = pi_state;
406}
407/*
408 * Lookup the task for the TID provided from user space and attach to
409 * it after doing proper sanity checks.
410 */
411static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
412 struct futex_pi_state **ps,
413 struct task_struct **exiting)
414{
415 pid_t pid = uval & FUTEX_TID_MASK;
416 struct task_struct *p;
417
418 /*
419 * We are the first waiter - try to look up the real owner and attach
420 * the new pi_state to it, but bail out when TID = 0 [1]
421 *
422 * The !pid check is paranoid. None of the call sites should end up
423 * with pid == 0, but better safe than sorry. Let the caller retry
424 */
425 if (!pid)
426 return -EAGAIN;
427 p = find_get_task_by_vpid(pid);
428 if (!p)
429 return handle_exit_race(uaddr, uval, NULL);
430
431 if (unlikely(p->flags & PF_KTHREAD)) {
432 put_task_struct(p);
433 return -EPERM;
434 }
435
436 /*
437 * We need to look at the task state to figure out, whether the
438 * task is exiting. To protect against the change of the task state
439 * in futex_exit_release(), we do this protected by p->pi_lock:
440 */
441 raw_spin_lock_irq(&p->pi_lock);
442 if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
443 /*
444 * The task is on the way out. When the futex state is
445 * FUTEX_STATE_DEAD, we know that the task has finished
446 * the cleanup:
447 */
448 int ret = handle_exit_race(uaddr, uval, p);
449
450 raw_spin_unlock_irq(&p->pi_lock);
451 /*
452 * If the owner task is between FUTEX_STATE_EXITING and
453 * FUTEX_STATE_DEAD then store the task pointer and keep
454 * the reference on the task struct. The calling code will
455 * drop all locks, wait for the task to reach
456 * FUTEX_STATE_DEAD and then drop the refcount. This is
457 * required to prevent a live lock when the current task
458 * preempted the exiting task between the two states.
459 */
460 if (ret == -EBUSY)
461 *exiting = p;
462 else
463 put_task_struct(p);
464 return ret;
465 }
466
467 __attach_to_pi_owner(p, key, ps);
468 raw_spin_unlock_irq(&p->pi_lock);
469
470 put_task_struct(p);
471
472 return 0;
473}
474
475static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
476{
477 int err;
478 u32 curval;
479
480 if (unlikely(should_fail_futex(true)))
481 return -EFAULT;
482
483 err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
484 if (unlikely(err))
485 return err;
486
487 /* If user space value changed, let the caller retry */
488 return curval != uval ? -EAGAIN : 0;
489}
490
491/**
492 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
493 * @uaddr: the pi futex user address
494 * @hb: the pi futex hash bucket
495 * @key: the futex key associated with uaddr and hb
496 * @ps: the pi_state pointer where we store the result of the
497 * lookup
498 * @task: the task to perform the atomic lock work for. This will
499 * be "current" except in the case of requeue pi.
500 * @exiting: Pointer to store the task pointer of the owner task
501 * which is in the middle of exiting
502 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
503 *
504 * Return:
505 * - 0 - ready to wait;
506 * - 1 - acquired the lock;
507 * - <0 - error
508 *
509 * The hb->lock must be held by the caller.
510 *
511 * @exiting is only set when the return value is -EBUSY. If so, this holds
512 * a refcount on the exiting task on return and the caller needs to drop it
513 * after waiting for the exit to complete.
514 */
515int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
516 union futex_key *key,
517 struct futex_pi_state **ps,
518 struct task_struct *task,
519 struct task_struct **exiting,
520 int set_waiters)
521{
522 u32 uval, newval, vpid = task_pid_vnr(task);
523 struct futex_q *top_waiter;
524 int ret;
525
526 /*
527 * Read the user space value first so we can validate a few
528 * things before proceeding further.
529 */
530 if (futex_get_value_locked(&uval, uaddr))
531 return -EFAULT;
532
533 if (unlikely(should_fail_futex(true)))
534 return -EFAULT;
535
536 /*
537 * Detect deadlocks.
538 */
539 if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
540 return -EDEADLK;
541
542 if ((unlikely(should_fail_futex(true))))
543 return -EDEADLK;
544
545 /*
546 * Lookup existing state first. If it exists, try to attach to
547 * its pi_state.
548 */
549 top_waiter = futex_top_waiter(hb, key);
550 if (top_waiter)
551 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
552
553 /*
554 * No waiter and user TID is 0. We are here because the
555 * waiters or the owner died bit is set or called from
556 * requeue_cmp_pi or for whatever reason something took the
557 * syscall.
558 */
559 if (!(uval & FUTEX_TID_MASK)) {
560 /*
561 * We take over the futex. No other waiters and the user space
562 * TID is 0. We preserve the owner died bit.
563 */
564 newval = uval & FUTEX_OWNER_DIED;
565 newval |= vpid;
566
567 /* The futex requeue_pi code can enforce the waiters bit */
568 if (set_waiters)
569 newval |= FUTEX_WAITERS;
570
571 ret = lock_pi_update_atomic(uaddr, uval, newval);
572 if (ret)
573 return ret;
574
575 /*
576 * If the waiter bit was requested the caller also needs PI
577 * state attached to the new owner of the user space futex.
578 *
579 * @task is guaranteed to be alive and it cannot be exiting
580 * because it is either sleeping or waiting in
581 * futex_requeue_pi_wakeup_sync().
582 *
583 * No need to do the full attach_to_pi_owner() exercise
584 * because @task is known and valid.
585 */
586 if (set_waiters) {
587 raw_spin_lock_irq(&task->pi_lock);
588 __attach_to_pi_owner(task, key, ps);
589 raw_spin_unlock_irq(&task->pi_lock);
590 }
591 return 1;
592 }
593
594 /*
595 * First waiter. Set the waiters bit before attaching ourself to
596 * the owner. If owner tries to unlock, it will be forced into
597 * the kernel and blocked on hb->lock.
598 */
599 newval = uval | FUTEX_WAITERS;
600 ret = lock_pi_update_atomic(uaddr, uval, newval);
601 if (ret)
602 return ret;
603 /*
604 * If the update of the user space value succeeded, we try to
605 * attach to the owner. If that fails, no harm done, we only
606 * set the FUTEX_WAITERS bit in the user space variable.
607 */
608 return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
609}
610
611/*
612 * Caller must hold a reference on @pi_state.
613 */
614static int wake_futex_pi(u32 __user *uaddr, u32 uval,
615 struct futex_pi_state *pi_state,
616 struct rt_mutex_waiter *top_waiter)
617{
618 struct task_struct *new_owner;
619 bool postunlock = false;
620 DEFINE_RT_WAKE_Q(wqh);
621 u32 curval, newval;
622 int ret = 0;
623
624 new_owner = top_waiter->task;
625
626 /*
627 * We pass it to the next owner. The WAITERS bit is always kept
628 * enabled while there is PI state around. We cleanup the owner
629 * died bit, because we are the owner.
630 */
631 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
632
633 if (unlikely(should_fail_futex(true))) {
634 ret = -EFAULT;
635 goto out_unlock;
636 }
637
638 ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
639 if (!ret && (curval != uval)) {
640 /*
641 * If a unconditional UNLOCK_PI operation (user space did not
642 * try the TID->0 transition) raced with a waiter setting the
643 * FUTEX_WAITERS flag between get_user() and locking the hash
644 * bucket lock, retry the operation.
645 */
646 if ((FUTEX_TID_MASK & curval) == uval)
647 ret = -EAGAIN;
648 else
649 ret = -EINVAL;
650 }
651
652 if (!ret) {
653 /*
654 * This is a point of no return; once we modified the uval
655 * there is no going back and subsequent operations must
656 * not fail.
657 */
658 pi_state_update_owner(pi_state, new_owner);
659 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
660 }
661
662out_unlock:
663 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
664
665 if (postunlock)
666 rt_mutex_postunlock(&wqh);
667
668 return ret;
669}
670
671static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
672 struct task_struct *argowner)
673{
674 struct futex_pi_state *pi_state = q->pi_state;
675 struct task_struct *oldowner, *newowner;
676 u32 uval, curval, newval, newtid;
677 int err = 0;
678
679 oldowner = pi_state->owner;
680
681 /*
682 * We are here because either:
683 *
684 * - we stole the lock and pi_state->owner needs updating to reflect
685 * that (@argowner == current),
686 *
687 * or:
688 *
689 * - someone stole our lock and we need to fix things to point to the
690 * new owner (@argowner == NULL).
691 *
692 * Either way, we have to replace the TID in the user space variable.
693 * This must be atomic as we have to preserve the owner died bit here.
694 *
695 * Note: We write the user space value _before_ changing the pi_state
696 * because we can fault here. Imagine swapped out pages or a fork
697 * that marked all the anonymous memory readonly for cow.
698 *
699 * Modifying pi_state _before_ the user space value would leave the
700 * pi_state in an inconsistent state when we fault here, because we
701 * need to drop the locks to handle the fault. This might be observed
702 * in the PID checks when attaching to PI state .
703 */
704retry:
705 if (!argowner) {
706 if (oldowner != current) {
707 /*
708 * We raced against a concurrent self; things are
709 * already fixed up. Nothing to do.
710 */
711 return 0;
712 }
713
714 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
715 /* We got the lock. pi_state is correct. Tell caller. */
716 return 1;
717 }
718
719 /*
720 * The trylock just failed, so either there is an owner or
721 * there is a higher priority waiter than this one.
722 */
723 newowner = rt_mutex_owner(&pi_state->pi_mutex);
724 /*
725 * If the higher priority waiter has not yet taken over the
726 * rtmutex then newowner is NULL. We can't return here with
727 * that state because it's inconsistent vs. the user space
728 * state. So drop the locks and try again. It's a valid
729 * situation and not any different from the other retry
730 * conditions.
731 */
732 if (unlikely(!newowner)) {
733 err = -EAGAIN;
734 goto handle_err;
735 }
736 } else {
737 WARN_ON_ONCE(argowner != current);
738 if (oldowner == current) {
739 /*
740 * We raced against a concurrent self; things are
741 * already fixed up. Nothing to do.
742 */
743 return 1;
744 }
745 newowner = argowner;
746 }
747
748 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
749 /* Owner died? */
750 if (!pi_state->owner)
751 newtid |= FUTEX_OWNER_DIED;
752
753 err = futex_get_value_locked(&uval, uaddr);
754 if (err)
755 goto handle_err;
756
757 for (;;) {
758 newval = (uval & FUTEX_OWNER_DIED) | newtid;
759
760 err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
761 if (err)
762 goto handle_err;
763
764 if (curval == uval)
765 break;
766 uval = curval;
767 }
768
769 /*
770 * We fixed up user space. Now we need to fix the pi_state
771 * itself.
772 */
773 pi_state_update_owner(pi_state, newowner);
774
775 return argowner == current;
776
777 /*
778 * In order to reschedule or handle a page fault, we need to drop the
779 * locks here. In the case of a fault, this gives the other task
780 * (either the highest priority waiter itself or the task which stole
781 * the rtmutex) the chance to try the fixup of the pi_state. So once we
782 * are back from handling the fault we need to check the pi_state after
783 * reacquiring the locks and before trying to do another fixup. If
784 * the fixup has already been done, we simply return.
785 *
786 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
787 * drop hb->lock since the caller owns the hb -> futex_q relation.
788 * Dropping the pi_mutex->wait_lock requires revalidating the state.
789 */
790handle_err:
791 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
792 spin_unlock(q->lock_ptr);
793
794 switch (err) {
795 case -EFAULT:
796 err = fault_in_user_writeable(uaddr);
797 break;
798
799 case -EAGAIN:
800 cond_resched();
801 err = 0;
802 break;
803
804 default:
805 WARN_ON_ONCE(1);
806 break;
807 }
808
809 spin_lock(q->lock_ptr);
810 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
811
812 /*
813 * Check if someone else fixed it for us:
814 */
815 if (pi_state->owner != oldowner)
816 return argowner == current;
817
818 /* Retry if err was -EAGAIN or the fault-in succeeded */
819 if (!err)
820 goto retry;
821
822 /*
823 * fault_in_user_writeable() failed so user state is immutable. At
824 * best we can make the kernel state consistent, but the user state will
825 * most likely be hosed and any subsequent unlock operation will be
826 * rejected due to PI futex rule [10].
827 *
828 * Ensure that the rtmutex owner is also the pi_state owner despite
829 * the user space value claiming something different. There is no
830 * point in unlocking the rtmutex if current is the owner as it
831 * would need to wait until the next waiter has taken the rtmutex
832 * to guarantee consistent state. Keep it simple. Userspace asked
833 * for this wrecked state.
834 *
835 * The rtmutex has an owner - either current or some other
836 * task. See the EAGAIN loop above.
837 */
838 pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
839
840 return err;
841}
842
843static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
844 struct task_struct *argowner)
845{
846 struct futex_pi_state *pi_state = q->pi_state;
847 int ret;
848
849 lockdep_assert_held(q->lock_ptr);
850
851 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
852 ret = __fixup_pi_state_owner(uaddr, q, argowner);
853 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
854 return ret;
855}
856
857/**
858 * fixup_pi_owner() - Post lock pi_state and corner case management
859 * @uaddr: user address of the futex
860 * @q: futex_q (contains pi_state and access to the rt_mutex)
861 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
862 *
863 * After attempting to lock an rt_mutex, this function is called to clean up
864 * the pi_state owner as well as handle race conditions that may allow us to
865 * acquire the lock. Must be called with the hb lock held.
866 *
867 * Return:
868 * - 1 - success, lock taken;
869 * - 0 - success, lock not taken;
870 * - <0 - on error (-EFAULT)
871 */
872int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
873{
874 if (locked) {
875 /*
876 * Got the lock. We might not be the anticipated owner if we
877 * did a lock-steal - fix up the PI-state in that case:
878 *
879 * Speculative pi_state->owner read (we don't hold wait_lock);
880 * since we own the lock, pi_state->owner == current is the
881 * stable state; anything else needs more attention.
882 */
883 if (q->pi_state->owner != current)
884 return fixup_pi_state_owner(uaddr, q, current);
885 return 1;
886 }
887
888 /*
889 * If we didn't get the lock, check if anybody stole it from us. In
890 * that case, we need to fix up the uval to point to them instead of
891 * us, otherwise bad things happen. [10]
892 *
893 * Another speculative read; pi_state->owner == current is unstable
894 * but needs our attention.
895 */
896 if (q->pi_state->owner == current)
897 return fixup_pi_state_owner(uaddr, q, NULL);
898
899 /*
900 * Paranoia check. If we did not take the lock, then we should not be
901 * the owner of the rt_mutex. Warn and establish consistent state.
902 */
903 if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
904 return fixup_pi_state_owner(uaddr, q, current);
905
906 return 0;
907}
908
909/*
910 * Userspace tried a 0 -> TID atomic transition of the futex value
911 * and failed. The kernel side here does the whole locking operation:
912 * if there are waiters then it will block as a consequence of relying
913 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
914 * a 0 value of the futex too).
915 *
916 * Also implements the FUTEX_TRYLOCK_PI operation, with the corresponding semantics.
917 */
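/*
 * Illustrative userspace fast path that precedes this slowpath (a sketch,
 * not kernel code; futex_word, tid and the futex_syscall() wrapper are
 * hypothetical). Only when the 0 -> TID cmpxchg fails does the thread
 * enter the kernel and reach futex_lock_pi():
 *
 *	if (!__sync_bool_compare_and_swap(futex_word, 0, tid))
 *		futex_syscall(futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 */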
918int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
919{
920 struct hrtimer_sleeper timeout, *to;
921 struct task_struct *exiting = NULL;
922 struct rt_mutex_waiter rt_waiter;
923 struct futex_hash_bucket *hb;
924 struct futex_q q = futex_q_init;
925 int res, ret;
926
927 if (!IS_ENABLED(CONFIG_FUTEX_PI))
928 return -ENOSYS;
929
930 if (refill_pi_state_cache())
931 return -ENOMEM;
932
933 to = futex_setup_timer(time, &timeout, flags, 0);
934
935retry:
936 ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
937 if (unlikely(ret != 0))
938 goto out;
939
940retry_private:
941 hb = futex_q_lock(&q);
942
943 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
944 &exiting, 0);
945 if (unlikely(ret)) {
946 /*
947 * Atomic work succeeded and we got the lock,
948 * or failed. Either way, we do _not_ block.
949 */
950 switch (ret) {
951 case 1:
952 /* We got the lock. */
953 ret = 0;
954 goto out_unlock_put_key;
955 case -EFAULT:
956 goto uaddr_faulted;
957 case -EBUSY:
958 case -EAGAIN:
959 /*
960 * Two reasons for this:
961 * - EBUSY: Task is exiting and we just wait for the
962 * exit to complete.
963 * - EAGAIN: The user space value changed.
964 */
965 futex_q_unlock(hb);
966 /*
967 * Handle the case where the owner is in the middle of
968 * exiting. Wait for the exit to complete otherwise
969 * this task might loop forever, aka. live lock.
970 */
971 wait_for_owner_exiting(ret, exiting);
972 cond_resched();
973 goto retry;
974 default:
975 goto out_unlock_put_key;
976 }
977 }
978
979 WARN_ON(!q.pi_state);
980
981 /*
982 * Only actually queue now that the atomic ops are done:
983 */
984 __futex_queue(&q, hb);
985
986 if (trylock) {
987 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
988 /* Fixup the trylock return value: */
989 ret = ret ? 0 : -EWOULDBLOCK;
990 goto no_block;
991 }
992
993 /*
994 * Must be done before we enqueue the waiter. Here it is unfortunately
995 * done under the hb lock, but that *should* work because it does nothing.
996 */
997 rt_mutex_pre_schedule();
998
999 rt_mutex_init_waiter(&rt_waiter);
1000
1001 /*
1002 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
1003 * hold it while doing __rt_mutex_start_proxy_lock(), because then it will
1004 * include hb->lock in the blocking chain, even though we will not in
1005 * fact hold it while blocking. This will lead it to report -EDEADLK
1006 * and BUG when futex_unlock_pi() interleaves with this.
1007 *
1008 * Therefore acquire wait_lock while holding hb->lock, but drop the
1009 * latter before calling __rt_mutex_start_proxy_lock(). This
1010 * interleaves with futex_unlock_pi() -- which does a similar lock
1011 * handoff -- such that the latter can observe the futex_q::pi_state
1012 * before __rt_mutex_start_proxy_lock() is done.
1013 */
1014 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
1015 spin_unlock(q.lock_ptr);
1016 /*
1017 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
1018 * such that futex_unlock_pi() is guaranteed to observe the waiter when
1019 * it sees the futex_q::pi_state.
1020 */
1021 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
1022 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
1023
1024 if (ret) {
1025 if (ret == 1)
1026 ret = 0;
1027 goto cleanup;
1028 }
1029
1030 if (unlikely(to))
1031 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
1032
1033 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
1034
1035cleanup:
1036 /*
1037 * If we failed to acquire the lock (deadlock/signal/timeout), we must
1038 * unwind the above; however, we cannot lock hb->lock because
1039 * rt_mutex already has a waiter enqueued and hb->lock can itself try
1040 * and enqueue an rt_waiter through rtlock.
1041 *
1042 * Doing the cleanup without holding hb->lock can cause inconsistent
1043 * state between hb and pi_state, but only in the direction of not
1044 * seeing a waiter that is leaving.
1045 *
1046 * See futex_unlock_pi(), it deals with this inconsistency.
1047 *
1048 * There be dragons here: since we must deal with the inconsistency on
1049 * the way out (here), it is impossible to detect/warn about the race
1050 * the other way around (missing an incoming waiter).
1051 *
1052 * What could possibly go wrong...
1053 */
1054 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
1055 ret = 0;
1056
1057 /*
1058 * Now that the rt_waiter has been dequeued, it is safe to use
1059 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
1060 * the pi_state owner.
1061 */
1062 spin_lock(q.lock_ptr);
1063 /*
1064 * Waiter is unqueued.
1065 */
1066 rt_mutex_post_schedule();
1067no_block:
1068 /*
1069 * Fixup the pi_state owner and possibly acquire the lock if we
1070 * haven't already.
1071 */
1072 res = fixup_pi_owner(uaddr, &q, !ret);
1073 /*
1074 * If fixup_pi_owner() returned an error, propagate that. If it acquired
1075 * the lock, clear our -ETIMEDOUT or -EINTR.
1076 */
1077 if (res)
1078 ret = (res < 0) ? res : 0;
1079
1080 futex_unqueue_pi(&q);
1081 spin_unlock(q.lock_ptr);
1082 goto out;
1083
1084out_unlock_put_key:
1085 futex_q_unlock(hb);
1086
1087out:
1088 if (to) {
1089 hrtimer_cancel(&to->timer);
1090 destroy_hrtimer_on_stack(&to->timer);
1091 }
1092 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1093
1094uaddr_faulted:
1095 futex_q_unlock(hb);
1096
1097 ret = fault_in_user_writeable(uaddr);
1098 if (ret)
1099 goto out;
1100
1101 if (!(flags & FLAGS_SHARED))
1102 goto retry_private;
1103
1104 goto retry;
1105}
1106
1107/*
1108 * Userspace attempted a TID -> 0 atomic transition, and failed.
1109 * This is the in-kernel slowpath: we look up the PI state (if any),
1110 * and do the rt-mutex unlock.
1111 */
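/*
 * Illustrative userspace fast path for the unlock side (a sketch, not
 * kernel code; futex_word, tid and futex_syscall() are hypothetical).
 * Only when the TID -> 0 cmpxchg fails, typically because FUTEX_WAITERS
 * is set, does the owner enter the kernel and reach futex_unlock_pi():
 *
 *	if (!__sync_bool_compare_and_swap(futex_word, tid, 0))
 *		futex_syscall(futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */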
1112int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
1113{
1114 u32 curval, uval, vpid = task_pid_vnr(current);
1115 union futex_key key = FUTEX_KEY_INIT;
1116 struct futex_hash_bucket *hb;
1117 struct futex_q *top_waiter;
1118 int ret;
1119
1120 if (!IS_ENABLED(CONFIG_FUTEX_PI))
1121 return -ENOSYS;
1122
1123retry:
1124 if (get_user(uval, uaddr))
1125 return -EFAULT;
1126 /*
1127 * We release only a lock we actually own:
1128 */
1129 if ((uval & FUTEX_TID_MASK) != vpid)
1130 return -EPERM;
1131
1132 ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
1133 if (ret)
1134 return ret;
1135
1136 hb = futex_hash(&key);
1137 spin_lock(&hb->lock);
1138retry_hb:
1139
1140 /*
1141 * Check waiters first. We do not trust user space values at
1142 * all and we at least want to know if user space fiddled
1143 * with the futex value instead of blindly unlocking.
1144 */
1145 top_waiter = futex_top_waiter(hb, &key);
1146 if (top_waiter) {
1147 struct futex_pi_state *pi_state = top_waiter->pi_state;
1148 struct rt_mutex_waiter *rt_waiter;
1149
1150 ret = -EINVAL;
1151 if (!pi_state)
1152 goto out_unlock;
1153
1154 /*
1155 * If current does not own the pi_state then the futex is
1156 * inconsistent and user space fiddled with the futex value.
1157 */
1158 if (pi_state->owner != current)
1159 goto out_unlock;
1160
1161 /*
1162 * By taking wait_lock while still holding hb->lock, we ensure
1163 * there is no point where we hold neither; and thereby
1164 * wake_futex_pi() must observe any new waiters.
1165 *
1166 * Since the cleanup: case in futex_lock_pi() removes the
1167 * rt_waiter without holding hb->lock, it is possible for
1168 * wake_futex_pi() to not find a waiter while the above does;
1169 * in this case the waiter is on the way out and it can be
1170 * ignored.
1171 *
1172 * In particular, this forces __rt_mutex_start_proxy_lock() to
1173 * complete such that we're guaranteed to observe the
1174 * rt_waiter.
1175 */
1176 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1177
1178 /*
1179 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
1180 * waiters even though futex thinks there are, then the waiter
1181 * is leaving. The entry needs to be removed from the list so a
1182 * new futex_lock_pi() is not using this stale PI-state while
1183 * the futex is available in user space again.
1184 * There can be more than one task on its way out, so we need
1185 * to retry.
1186 */
1187 rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
1188 if (!rt_waiter) {
1189 __futex_unqueue(top_waiter);
1190 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1191 goto retry_hb;
1192 }
1193
1194 get_pi_state(pi_state);
1195 spin_unlock(&hb->lock);
1196
1197 /* drops pi_state->pi_mutex.wait_lock */
1198 ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
1199
1200 put_pi_state(pi_state);
1201
1202 /*
1203 * Success, we're done! No tricky corner cases.
1204 */
1205 if (!ret)
1206 return ret;
1207 /*
1208 * The atomic access to the futex value generated a
1209 * pagefault, so retry the user-access and the wakeup:
1210 */
1211 if (ret == -EFAULT)
1212 goto pi_faulted;
1213 /*
1214 * An unconditional UNLOCK_PI op raced against a waiter
1215 * setting the FUTEX_WAITERS bit. Try again.
1216 */
1217 if (ret == -EAGAIN)
1218 goto pi_retry;
1219 /*
1220 * wake_futex_pi has detected invalid state. Tell user
1221 * space.
1222 */
1223 return ret;
1224 }
1225
1226 /*
1227 * We have no kernel internal state, i.e. no waiters in the
1228 * kernel. Waiters which are about to queue themselves are stuck
1229 * on hb->lock. So we can safely ignore them. We preserve
1230 * neither the WAITERS bit nor the OWNER_DIED one. We are the
1231 * owner.
1232 */
1233 if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
1234 spin_unlock(&hb->lock);
1235 switch (ret) {
1236 case -EFAULT:
1237 goto pi_faulted;
1238
1239 case -EAGAIN:
1240 goto pi_retry;
1241
1242 default:
1243 WARN_ON_ONCE(1);
1244 return ret;
1245 }
1246 }
1247
1248 /*
1249 * If uval has changed, let user space handle it.
1250 */
1251 ret = (curval == uval) ? 0 : -EAGAIN;
1252
1253out_unlock:
1254 spin_unlock(&hb->lock);
1255 return ret;
1256
1257pi_retry:
1258 cond_resched();
1259 goto retry;
1260
1261pi_faulted:
1262
1263 ret = fault_in_user_writeable(uaddr);
1264 if (!ret)
1265 goto retry;
1266
1267 return ret;
1268}
1269