// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}

int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}
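
To make the kernel-side registration path above concrete, here is a minimal userspace sketch (not part of the kernel file) that exercises the IORING_REGISTER_PROBE opcode handled by io_probe(). It is only an illustration under a few assumptions: it uses the raw syscall numbers from <sys/syscall.h> and the uapi definitions in <linux/io_uring.h> rather than liburing, and it passes a zeroed probe buffer sized for 256 ops, matching the nr_args <= 256 and memchr_inv() checks in the code above.

/*
 * Illustrative userspace sketch: probe which io_uring opcodes the running
 * kernel supports via io_uring_register(IORING_REGISTER_PROBE).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

int main(void)
{
	struct io_uring_params params;
	struct io_uring_probe *probe;
	int ring_fd, i;

	/* Set up a small ring; the params struct must be zeroed. */
	memset(&params, 0, sizeof(params));
	ring_fd = syscall(__NR_io_uring_setup, 4, &params);
	if (ring_fd < 0) {
		perror("io_uring_setup");
		return 1;
	}

	/* The probe buffer must be zero-filled, or the kernel returns -EINVAL. */
	probe = calloc(1, sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op));
	if (!probe)
		return 1;

	/* nr_args is the number of probe_op entries; the kernel caps it at 256. */
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_PROBE, probe, 256) < 0) {
		perror("io_uring_register");
		return 1;
	}

	for (i = 0; i < probe->ops_len; i++)
		if (probe->ops[i].flags & IO_URING_OP_SUPPORTED)
			printf("opcode %d supported\n", probe->ops[i].op);

	free(probe);
	close(ring_fd);
	return 0;
}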