user-trap.c - samples/seccomp/user-trap.c - Linux diff v6.13.7

  1#include <signal.h>
  2#include <stdio.h>
  3#include <stdlib.h>
  4#include <unistd.h>
  5#include <errno.h>
  6#include <fcntl.h>
  7#include <string.h>
  8#include <stddef.h>
  9#include <sys/sysmacros.h>
 10#include <sys/types.h>
 11#include <sys/wait.h>
 12#include <sys/socket.h>
 13#include <sys/stat.h>
 14#include <sys/mman.h>
 15#include <sys/syscall.h>
 16#include <sys/user.h>
 17#include <sys/ioctl.h>
 18#include <sys/ptrace.h>
 19#include <sys/mount.h>
 20#include <linux/limits.h>
 21#include <linux/filter.h>
 22#include <linux/seccomp.h>
 23
 24#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
 25
 26static int seccomp(unsigned int op, unsigned int flags, void *args)
 27{
 28	errno = 0;
 29	return syscall(__NR_seccomp, op, flags, args);
 30}
 31
 32static int send_fd(int sock, int fd)
 33{
 34	struct msghdr msg = {};
 35	struct cmsghdr *cmsg;
 36	int *fd_ptr;
 37	char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
 38	struct iovec io = {
 39		.iov_base = &c,
 40		.iov_len = 1,
 41	};
 42
 43	msg.msg_iov = &io;
 44	msg.msg_iovlen = 1;
 45	msg.msg_control = buf;
 46	msg.msg_controllen = sizeof(buf);
 47	cmsg = CMSG_FIRSTHDR(&msg);
 48	cmsg->cmsg_level = SOL_SOCKET;
 49	cmsg->cmsg_type = SCM_RIGHTS;
 50	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 51	fd_ptr = (int *)CMSG_DATA(cmsg);
 52	*fd_ptr = fd;
 53	msg.msg_controllen = cmsg->cmsg_len;
 54
 55	if (sendmsg(sock, &msg, 0) < 0) {
 56		perror("sendmsg");
 57		return -1;
 58	}
 59
 60	return 0;
 61}
 62
 63static int recv_fd(int sock)
 64{
 65	struct msghdr msg = {};
 66	struct cmsghdr *cmsg;
 67	int *fd_ptr;
 68	char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
 69	struct iovec io = {
 70		.iov_base = &c,
 71		.iov_len = 1,
 72	};
 73
 74	msg.msg_iov = &io;
 75	msg.msg_iovlen = 1;
 76	msg.msg_control = buf;
 77	msg.msg_controllen = sizeof(buf);
 78
 79	if (recvmsg(sock, &msg, 0) < 0) {
 80		perror("recvmsg");
 81		return -1;
 82	}
 83
 84	cmsg = CMSG_FIRSTHDR(&msg);
 85	fd_ptr = (int *)CMSG_DATA(cmsg);
 86
 87	return *fd_ptr;
 88}
 89
 90static int user_trap_syscall(int nr, unsigned int flags)
 91{
 92	struct sock_filter filter[] = {
 93		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
 94			offsetof(struct seccomp_data, nr)),
 95		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
 96		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
 97		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
 98	};
 99
100	struct sock_fprog prog = {
101		.len = (unsigned short)ARRAY_SIZE(filter),
102		.filter = filter,
103	};
104
105	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
106}
107
/*
 * Copy one path argument out of the target task's memory.
 *
 * Seeks the already opened /proc/<pid>/mem descriptor @mem to @addr and reads
 * up to @len bytes into @buf.
 *
 * Returns 0 when the bytes read contain a NUL terminator (so @buf is a valid
 * C string), 1 when they do not (including a zero-length read), and -1 after
 * perror() when the seek or read fails.
 */
static int read_remote_path(int mem, __u64 addr, char *buf, size_t len)
{
	ssize_t n;

	if (lseek(mem, addr, SEEK_SET) < 0) {
		perror("seek");
		return -1;
	}

	n = read(mem, buf, len);
	if (n < 0) {
		perror("read");
		return -1;
	}

	/* Only the first n bytes are initialized; look no further. */
	if (!memchr(buf, '\0', (size_t)n))
		return 1;

	return 0;
}

/*
 * Decide how to answer one seccomp user notification for mount(2).
 *
 * @req:      notification received from the listener
 * @resp:     response to fill in; id is copied from @req, val is zeroed and
 *            error defaults to -EPERM
 * @listener: seccomp notification fd, used to re-validate @req->id
 *
 * Bind mounts whose source and target both start with "/tmp/" are performed
 * on behalf of the task and answered with success; everything else keeps the
 * -EPERM answer.
 *
 * Returns 0 when @resp is ready to be sent, -1 when the request could not be
 * processed (unexpected syscall, target memory not readable, task gone, or
 * the mount itself failed).
 */
static int handle_req(struct seccomp_notif *req,
		      struct seccomp_notif_resp *resp, int listener)
{
	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
	int ret = -1, mem, rc;

	resp->id = req->id;
	resp->error = -EPERM;
	resp->val = 0;

	if (req->data.nr != __NR_mount) {
		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
		return -1;
	}

	/* Only allow bind mounts. */
	if (!(req->data.args[3] & MS_BIND))
		return 0;

	/*
	 * Ok, let's read the task's memory to see where they wanted their
	 * mount to go.
	 */
	snprintf(path, sizeof(path), "/proc/%u/mem", req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0) {
		perror("open mem");
		return -1;
	}

	/*
	 * Now we avoid a TOCTOU: we referred to the task by its pid, but since
	 * the task that made the syscall may have died, we need to confirm
	 * that the pid is still valid after we open its /proc/pid/mem file. We
	 * can ask the listener fd this as follows.
	 *
	 * Note that this check should occur *after* any task-specific
	 * resources are opened, to make sure that the task has not died and
	 * we're not wrongly reading someone else's state in order to make
	 * decisions.
	 */
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		fprintf(stderr, "task died before we could map its memory\n");
		goto out;
	}

	/*
	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
	 * that to avoid another TOCTOU, we read all of the pointer args before
	 * we decide to allow the syscall.
	 */
	rc = read_remote_path(mem, req->data.args[0], source, sizeof(source));
	if (rc == 0)
		rc = read_remote_path(mem, req->data.args[1], target,
				      sizeof(target));
	if (rc < 0)
		goto out;
	if (rc > 0) {
		/*
		 * Without a terminator the buffers are not strings and must
		 * not reach strncmp()/mount(); answer with the default EPERM.
		 */
		fprintf(stderr, "mount path is not NUL-terminated, denying\n");
		ret = 0;
		goto out;
	}

	/*
	 * Our policy is to only allow bind mounts inside /tmp. This isn't very
	 * interesting, because we could do unprivileged bind mounts with user
	 * namespaces already, but you get the idea.
	 *
	 * NOTE(review): a plain prefix check does not stop "/tmp/../etc" or
	 * symlinks out of /tmp; a real supervisor needs a stricter check.
	 */
	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
			ret = -1;
			perror("actual mount");
			goto out;
		}
		resp->error = 0;
	}

	/*
	 * Even if we didn't allow it because of policy, generating the
	 * response was a success, because we want to tell the worker EPERM.
	 */
	ret = 0;

out:
	close(mem);
	return ret;
}
204
/*
 * Demo driver for seccomp user notifications.
 *
 * Three processes cooperate:
 *  - the worker installs a filter trapping mount(2), drops to uid 1000,
 *    sends the listener fd to the parent and then attempts one mount that
 *    must be refused and one bind mount that must succeed;
 *  - the tracer receives notifications on the listener fd and answers them
 *    through handle_req();
 *  - the parent waits for the worker, cleans up /tmp/foo and kills the
 *    tracer.
 *
 * Returns 0 when the worker exited with status 0 and cleanup succeeded,
 * 1 otherwise.
 */
int main(void)
{
	int sk_pair[2], ret = 1, status, listener;
	pid_t worker = 0, tracer = 0;

	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
		perror("socketpair");
		return 1;
	}

	worker = fork();
	if (worker < 0) {
		perror("fork");
		goto close_pair;
	}

	if (worker == 0) {
		listener = user_trap_syscall(__NR_mount,
					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0) {
			perror("seccomp");
			exit(1);
		}

		/*
		 * Drop privileges. We definitely can't mount as uid 1000.
		 */
		if (setuid(1000) < 0) {
			perror("setuid");
			exit(1);
		}

		/*
		 * Send the listener to the parent; also serves as
		 * synchronization.
		 */
		if (send_fd(sk_pair[1], listener) < 0)
			exit(1);
		close(listener);

		if (mkdir("/tmp/foo", 0755) < 0) {
			perror("mkdir");
			exit(1);
		}

		/*
		 * Try a bad mount just for grins.
		 */
		if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
			fprintf(stderr, "huh? mounted /dev/sda?\n");
			exit(1);
		}

		if (errno != EPERM) {
			perror("bad error from mount");
			exit(1);
		}

		/*
		 * Ok, we expect this one to succeed.
		 */
		if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
			perror("mount");
			exit(1);
		}

		exit(0);
	}

	/*
	 * Get the listener from the child.
	 */
	listener = recv_fd(sk_pair[0]);
	if (listener < 0)
		goto out_kill;

	/*
	 * Fork a task to handle the requests. This isn't strictly necessary,
	 * but it makes the particular writing of this sample easier, since we
	 * can just wait for the tracee to exit and kill the tracer.
	 */
	tracer = fork();
	if (tracer < 0) {
		perror("fork");
		close(listener);
		goto out_kill;
	}

	if (tracer == 0) {
		struct seccomp_notif *req;
		struct seccomp_notif_resp *resp;
		struct seccomp_notif_sizes sizes;

		/* The kernel tells us how large its notification structs are. */
		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
			perror("seccomp(GET_NOTIF_SIZES)");
			goto out_close;
		}

		req = malloc(sizes.seccomp_notif);
		if (!req)
			goto out_close;

		resp = malloc(sizes.seccomp_notif_resp);
		if (!resp)
			goto out_req;
		memset(resp, 0, sizes.seccomp_notif_resp);

		while (1) {
			/* The request buffer is zeroed before every receive. */
			memset(req, 0, sizes.seccomp_notif);
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
				perror("ioctl recv");
				goto out_resp;
			}

			if (handle_req(req, resp, listener) < 0)
				goto out_resp;

			/*
			 * ENOENT here means that the task may have gotten a
			 * signal and restarted the syscall. It's up to the
			 * handler to decide what to do in this case, but for
			 * the sample code, we just ignore it. Probably
			 * something better should happen, like undoing the
			 * mount, or keeping track of the args to make sure we
			 * don't do it again.
			 */
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
			    errno != ENOENT) {
				perror("ioctl send");
				goto out_resp;
			}
		}
out_resp:
		free(resp);
out_req:
		free(req);
out_close:
		close(listener);
		exit(1);
	}

	close(listener);

	if (waitpid(worker, &status, 0) != worker) {
		perror("waitpid");
		goto out_kill;
	}

	/*
	 * The worker has been reaped, so its pid may be recycled at any time;
	 * forget it so the cleanup below cannot SIGKILL an unrelated process.
	 */
	worker = 0;

	/* EINVAL: /tmp/foo is not a mount point, e.g. the worker failed early. */
	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
		perror("umount2");
		goto out_kill;
	}

	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
		perror("remove");
		/* Go through the common cleanup so the tracer is not left behind. */
		goto out_kill;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		fprintf(stderr, "worker exited nonzero\n");
		goto out_kill;
	}

	ret = 0;

out_kill:
	/* Kill and reap whichever children are still ours. */
	if (tracer > 0) {
		kill(tracer, SIGKILL);
		waitpid(tracer, NULL, 0);
	}
	if (worker > 0) {
		kill(worker, SIGKILL);
		waitpid(worker, NULL, 0);
	}

close_pair:
	close(sk_pair[0]);
	close(sk_pair[1]);
	return ret;
}

  1#include <signal.h>
  2#include <stdio.h>
  3#include <stdlib.h>
  4#include <unistd.h>
  5#include <errno.h>
  6#include <fcntl.h>
  7#include <string.h>
  8#include <stddef.h>
  9#include <sys/sysmacros.h>
 10#include <sys/types.h>
 11#include <sys/wait.h>
 12#include <sys/socket.h>
 13#include <sys/stat.h>
 14#include <sys/mman.h>
 15#include <sys/syscall.h>
 16#include <sys/user.h>
 17#include <sys/ioctl.h>
 18#include <sys/ptrace.h>
 19#include <sys/mount.h>
 20#include <linux/limits.h>
 21#include <linux/filter.h>
 22#include <linux/seccomp.h>
 23
 24#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
 25
 26static int seccomp(unsigned int op, unsigned int flags, void *args)
 27{
 28	errno = 0;
 29	return syscall(__NR_seccomp, op, flags, args);
 30}
 31
 32static int send_fd(int sock, int fd)
 33{
 34	struct msghdr msg = {};
 35	struct cmsghdr *cmsg;
 
 36	char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
 37	struct iovec io = {
 38		.iov_base = &c,
 39		.iov_len = 1,
 40	};
 41
 42	msg.msg_iov = &io;
 43	msg.msg_iovlen = 1;
 44	msg.msg_control = buf;
 45	msg.msg_controllen = sizeof(buf);
 46	cmsg = CMSG_FIRSTHDR(&msg);
 47	cmsg->cmsg_level = SOL_SOCKET;
 48	cmsg->cmsg_type = SCM_RIGHTS;
 49	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 50	*((int *)CMSG_DATA(cmsg)) = fd;
 
 51	msg.msg_controllen = cmsg->cmsg_len;
 52
 53	if (sendmsg(sock, &msg, 0) < 0) {
 54		perror("sendmsg");
 55		return -1;
 56	}
 57
 58	return 0;
 59}
 60
 61static int recv_fd(int sock)
 62{
 63	struct msghdr msg = {};
 64	struct cmsghdr *cmsg;
 
 65	char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
 66	struct iovec io = {
 67		.iov_base = &c,
 68		.iov_len = 1,
 69	};
 70
 71	msg.msg_iov = &io;
 72	msg.msg_iovlen = 1;
 73	msg.msg_control = buf;
 74	msg.msg_controllen = sizeof(buf);
 75
 76	if (recvmsg(sock, &msg, 0) < 0) {
 77		perror("recvmsg");
 78		return -1;
 79	}
 80
 81	cmsg = CMSG_FIRSTHDR(&msg);
 
 82
 83	return *((int *)CMSG_DATA(cmsg));
 84}
 85
 86static int user_trap_syscall(int nr, unsigned int flags)
 87{
 88	struct sock_filter filter[] = {
 89		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
 90			offsetof(struct seccomp_data, nr)),
 91		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
 92		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
 93		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
 94	};
 95
 96	struct sock_fprog prog = {
 97		.len = (unsigned short)ARRAY_SIZE(filter),
 98		.filter = filter,
 99	};
100
101	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
102}
103
/*
 * Copy one path argument out of the target task's memory.
 *
 * Seeks the already opened /proc/<pid>/mem descriptor @mem to @addr and reads
 * up to @len bytes into @buf.
 *
 * Returns 0 when the bytes read contain a NUL terminator (so @buf is a valid
 * C string), 1 when they do not (including a zero-length read), and -1 after
 * perror() when the seek or read fails.
 */
static int read_remote_path(int mem, __u64 addr, char *buf, size_t len)
{
	ssize_t n;

	if (lseek(mem, addr, SEEK_SET) < 0) {
		perror("seek");
		return -1;
	}

	n = read(mem, buf, len);
	if (n < 0) {
		perror("read");
		return -1;
	}

	/* Only the first n bytes are initialized; look no further. */
	if (!memchr(buf, '\0', (size_t)n))
		return 1;

	return 0;
}

/*
 * Decide how to answer one seccomp user notification for mount(2).
 *
 * @req:      notification received from the listener
 * @resp:     response to fill in; id is copied from @req, val is zeroed and
 *            error defaults to -EPERM
 * @listener: seccomp notification fd, used to re-validate @req->id
 *
 * Bind mounts whose source and target both start with "/tmp/" are performed
 * on behalf of the task and answered with success; everything else keeps the
 * -EPERM answer.
 *
 * Returns 0 when @resp is ready to be sent, -1 when the request could not be
 * processed (unexpected syscall, target memory not readable, task gone, or
 * the mount itself failed).
 */
static int handle_req(struct seccomp_notif *req,
		      struct seccomp_notif_resp *resp, int listener)
{
	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
	int ret = -1, mem, rc;

	resp->id = req->id;
	resp->error = -EPERM;
	resp->val = 0;

	if (req->data.nr != __NR_mount) {
		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
		return -1;
	}

	/* Only allow bind mounts. */
	if (!(req->data.args[3] & MS_BIND))
		return 0;

	/*
	 * Ok, let's read the task's memory to see where they wanted their
	 * mount to go.
	 */
	snprintf(path, sizeof(path), "/proc/%u/mem", req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0) {
		perror("open mem");
		return -1;
	}

	/*
	 * Now we avoid a TOCTOU: we referred to the task by its pid, but since
	 * the task that made the syscall may have died, we need to confirm
	 * that the pid is still valid after we open its /proc/pid/mem file. We
	 * can ask the listener fd this as follows.
	 *
	 * Note that this check should occur *after* any task-specific
	 * resources are opened, to make sure that the task has not died and
	 * we're not wrongly reading someone else's state in order to make
	 * decisions.
	 */
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		fprintf(stderr, "task died before we could map its memory\n");
		goto out;
	}

	/*
	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
	 * that to avoid another TOCTOU, we read all of the pointer args before
	 * we decide to allow the syscall.
	 */
	rc = read_remote_path(mem, req->data.args[0], source, sizeof(source));
	if (rc == 0)
		rc = read_remote_path(mem, req->data.args[1], target,
				      sizeof(target));
	if (rc < 0)
		goto out;
	if (rc > 0) {
		/*
		 * Without a terminator the buffers are not strings and must
		 * not reach strncmp()/mount(); answer with the default EPERM.
		 */
		fprintf(stderr, "mount path is not NUL-terminated, denying\n");
		ret = 0;
		goto out;
	}

	/*
	 * Our policy is to only allow bind mounts inside /tmp. This isn't very
	 * interesting, because we could do unprivileged bind mounts with user
	 * namespaces already, but you get the idea.
	 *
	 * NOTE(review): a plain prefix check does not stop "/tmp/../etc" or
	 * symlinks out of /tmp; a real supervisor needs a stricter check.
	 */
	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
			ret = -1;
			perror("actual mount");
			goto out;
		}
		resp->error = 0;
	}

	/*
	 * Even if we didn't allow it because of policy, generating the
	 * response was a success, because we want to tell the worker EPERM.
	 */
	ret = 0;

out:
	close(mem);
	return ret;
}
200
/*
 * Demo driver for seccomp user notifications.
 *
 * Three processes cooperate:
 *  - the worker installs a filter trapping mount(2), drops to uid 1000,
 *    sends the listener fd to the parent and then attempts one mount that
 *    must be refused and one bind mount that must succeed;
 *  - the tracer receives notifications on the listener fd and answers them
 *    through handle_req();
 *  - the parent waits for the worker, cleans up /tmp/foo and kills the
 *    tracer.
 *
 * Returns 0 when the worker exited with status 0 and cleanup succeeded,
 * 1 otherwise.
 */
int main(void)
{
	int sk_pair[2], ret = 1, status, listener;
	pid_t worker = 0, tracer = 0;

	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
		perror("socketpair");
		return 1;
	}

	worker = fork();
	if (worker < 0) {
		perror("fork");
		goto close_pair;
	}

	if (worker == 0) {
		listener = user_trap_syscall(__NR_mount,
					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0) {
			perror("seccomp");
			exit(1);
		}

		/*
		 * Drop privileges. We definitely can't mount as uid 1000.
		 */
		if (setuid(1000) < 0) {
			perror("setuid");
			exit(1);
		}

		/*
		 * Send the listener to the parent; also serves as
		 * synchronization.
		 */
		if (send_fd(sk_pair[1], listener) < 0)
			exit(1);
		close(listener);

		if (mkdir("/tmp/foo", 0755) < 0) {
			perror("mkdir");
			exit(1);
		}

		/*
		 * Try a bad mount just for grins.
		 */
		if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
			fprintf(stderr, "huh? mounted /dev/sda?\n");
			exit(1);
		}

		if (errno != EPERM) {
			perror("bad error from mount");
			exit(1);
		}

		/*
		 * Ok, we expect this one to succeed.
		 */
		if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
			perror("mount");
			exit(1);
		}

		exit(0);
	}

	/*
	 * Get the listener from the child.
	 */
	listener = recv_fd(sk_pair[0]);
	if (listener < 0)
		goto out_kill;

	/*
	 * Fork a task to handle the requests. This isn't strictly necessary,
	 * but it makes the particular writing of this sample easier, since we
	 * can just wait for the tracee to exit and kill the tracer.
	 */
	tracer = fork();
	if (tracer < 0) {
		perror("fork");
		close(listener);
		goto out_kill;
	}

	if (tracer == 0) {
		struct seccomp_notif *req;
		struct seccomp_notif_resp *resp;
		struct seccomp_notif_sizes sizes;

		/* The kernel tells us how large its notification structs are. */
		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
			perror("seccomp(GET_NOTIF_SIZES)");
			goto out_close;
		}

		req = malloc(sizes.seccomp_notif);
		if (!req)
			goto out_close;

		resp = malloc(sizes.seccomp_notif_resp);
		if (!resp)
			goto out_req;
		memset(resp, 0, sizes.seccomp_notif_resp);

		while (1) {
			/* The request buffer is zeroed before every receive. */
			memset(req, 0, sizes.seccomp_notif);
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
				perror("ioctl recv");
				goto out_resp;
			}

			if (handle_req(req, resp, listener) < 0)
				goto out_resp;

			/*
			 * ENOENT here means that the task may have gotten a
			 * signal and restarted the syscall. It's up to the
			 * handler to decide what to do in this case, but for
			 * the sample code, we just ignore it. Probably
			 * something better should happen, like undoing the
			 * mount, or keeping track of the args to make sure we
			 * don't do it again.
			 */
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
			    errno != ENOENT) {
				perror("ioctl send");
				goto out_resp;
			}
		}
out_resp:
		free(resp);
out_req:
		free(req);
out_close:
		close(listener);
		exit(1);
	}

	close(listener);

	if (waitpid(worker, &status, 0) != worker) {
		perror("waitpid");
		goto out_kill;
	}

	/*
	 * The worker has been reaped, so its pid may be recycled at any time;
	 * forget it so the cleanup below cannot SIGKILL an unrelated process.
	 */
	worker = 0;

	/* EINVAL: /tmp/foo is not a mount point, e.g. the worker failed early. */
	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
		perror("umount2");
		goto out_kill;
	}

	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
		perror("remove");
		/* Go through the common cleanup so the tracer is not left behind. */
		goto out_kill;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		fprintf(stderr, "worker exited nonzero\n");
		goto out_kill;
	}

	ret = 0;

out_kill:
	/* Kill and reap whichever children are still ours. */
	if (tracer > 0) {
		kill(tracer, SIGKILL);
		waitpid(tracer, NULL, 0);
	}
	if (worker > 0) {
		kill(worker, SIGKILL);
		waitpid(worker, NULL, 0);
	}

close_pair:
	close(sk_pair[0]);
	close(sk_pair[1]);
	return ret;
}