// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

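/*
 * IORING_REGISTER_PROBE: fill in an io_uring_probe structure describing
 * which opcodes this kernel supports, so userspace can feature-check at
 * runtime instead of guessing from the kernel version.
 */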
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

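/*
 * Personalities are snapshots of the registering task's credentials,
 * stored in ctx->personalities and looked up by the id that individual
 * SQEs can select via sqe->personality.
 */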
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

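/*
 * Restrictions limit which register opcodes, SQE opcodes and SQE flags a
 * ring will accept. They can only be installed once, while the ring is
 * still disabled (IORING_SETUP_R_DISABLED), and take effect when the ring
 * is enabled.
 */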
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

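/*
 * IORING_REGISTER_ENABLE_RINGS: take a ring that was created with
 * IORING_SETUP_R_DISABLED live. This locks in any registered restrictions,
 * claims the submitter task for SINGLE_ISSUER setups and wakes a waiting
 * SQPOLL thread.
 */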
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

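/*
 * Apply (or clear, when new_mask is NULL) the CPU affinity mask for io-wq
 * workers. With IORING_SETUP_SQPOLL the mask is applied via the SQPOLL
 * side (io_sqpoll_wq_cpu_affinity()) rather than the registering task's
 * io-wq.
 */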
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

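/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: new_count[0] limits bounded and
 * new_count[1] limits unbounded io-wq workers. A value of 0 leaves the
 * current limit untouched, and the previous limits are copied back to
 * userspace.
 */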
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

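/*
 * IORING_REGISTER_CLOCK: select which clock source subsequent CQ wait
 * timeouts are measured against (CLOCK_MONOTONIC or CLOCK_BOOTTIME).
 */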
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	unsigned short n_ring_pages;
	unsigned short n_sqe_pages;
	struct page **ring_pages;
	struct page **sqe_pages;
	struct io_uring_sqe *sq_sqes;
	struct io_rings *rings;
};

static void io_register_free_rings(struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
				true);
		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
				true);
	} else {
		io_pages_free(&r->ring_pages, r->n_ring_pages);
		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
		vunmap(r->rings);
		vunmap(r->sq_sqes);
	}
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

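/*
 * IORING_REGISTER_RESIZE_RINGS: allocate new SQ/CQ rings of the requested
 * size, copy over any entries still in flight, and swap the new rings in
 * under ctx->resize_lock and the completion lock so that neither mmap nor
 * CQE posting can observe a half-swapped state.
 */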
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	void *ptr;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
				&sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	if (!(p.flags & IORING_SETUP_NO_MMAP))
		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
	else
		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
						p.cq_off.user_addr, size);
	if (IS_ERR(n.rings))
		return PTR_ERR(n.rings);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(&p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(&p, &n);
		return -EOVERFLOW;
	}

	if (!(p.flags & IORING_SETUP_NO_MMAP))
		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
	else
		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
					p.sq_off.user_addr,
					size);
	if (IS_ERR(ptr)) {
		io_register_free_rings(&p, &n);
		return PTR_ERR(ptr);
	}

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->resize_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	n.sq_sqes = ptr;
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
	WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head));
	WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail));
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, n_ring_pages);
	swap_old(ctx, o, n, n_sqe_pages);
	swap_old(ctx, o, n, ring_pages);
	swap_old(ctx, o, n, sqe_pages);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->resize_lock);
	io_register_free_rings(&p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

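/*
 * IORING_REGISTER_MEM_REGION: register a user-provided memory region with
 * the ring. If IORING_MEM_REGION_REG_WAIT_ARG is set, the region is used
 * for extended CQ wait arguments, which is only allowed while the ring is
 * still disabled.
 */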
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;

	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &ctx->param_region, &rd);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
		}
	}

	return -EINVAL;
}

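/*
 * io_uring_register(2) entry point. The opcode may carry
 * IORING_REGISTER_USE_REGISTERED_RING to indicate that 'fd' is an index
 * into the task's registered ring fds rather than a regular file
 * descriptor, and fd == -1 selects the "blind" opcodes that operate
 * without a ring. All per-ring opcodes run under ctx->uring_lock.
 */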
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);
	if (!use_registered_ring)
		fput(file);
	return ret;
}