Loading...
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
4 *
5 * This exactly matches what is marshalled into the raw_syscall:sys_enter
6 * payload expected by the 'perf trace' beautifiers.
7 */
8
9#include "vmlinux.h"
10#include <bpf/bpf_helpers.h>
11#include <linux/limits.h>
12
/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Determine whether some value is a power of two, where zero is *not*
 * considered a power of two.
 *
 * Every use of @n is parenthesized so that compound expressions such as
 * `a | b` or `x << 1` are evaluated as a whole. @n is expanded more than
 * once, so it must not have side effects.
 *
 * Return: true if @n is a power of 2, otherwise false.
 */
#define is_power_of_2(n) ((n) != 0 && (((n) & ((n) - 1)) == 0))
22
23#define MAX_CPUS 4096
24
25/* bpf-output associated map */
26struct __augmented_syscalls__ {
27 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
28 __type(key, int);
29 __type(value, __u32);
30 __uint(max_entries, MAX_CPUS);
31} __augmented_syscalls__ SEC(".maps");
32
33/*
34 * What to augment at entry?
35 *
36 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel
37 */
38struct syscalls_sys_enter {
39 __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
40 __type(key, __u32);
41 __type(value, __u32);
42 __uint(max_entries, 512);
43} syscalls_sys_enter SEC(".maps");
44
45/*
46 * What to augment at exit?
47 *
48 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
49 */
50struct syscalls_sys_exit {
51 __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
52 __type(key, __u32);
53 __type(value, __u32);
54 __uint(max_entries, 512);
55} syscalls_sys_exit SEC(".maps");
56
/*
 * Context handed to the sys_enter programs in this file; it is also the
 * leading, non-augmented part of every payload sent to userspace.
 */
struct syscall_enter_args {
	unsigned long long common_tp_fields;	/* opaque to this file, never read here */
	long		   syscall_nr;		/* used as the index for the tail call */
	unsigned long	   args[6];		/* raw syscall arguments */
};
62
/* Context handed to the sys_exit program in this file */
struct syscall_exit_args {
	unsigned long long common_tp_fields;	/* opaque to this file, never read here */
	long		   syscall_nr;		/* used as the index for the tail call */
	long		   ret;			/* syscall return value */
};
68
69struct augmented_arg {
70 unsigned int size;
71 int err;
72 char value[PATH_MAX];
73};
74
75struct pids_filtered {
76 __uint(type, BPF_MAP_TYPE_HASH);
77 __type(key, pid_t);
78 __type(value, bool);
79 __uint(max_entries, 64);
80} pids_filtered SEC(".maps");
81
/*
 * Desired design of maximum size and alignment (see RFC2553)
 */
#define SS_MAXSIZE 128 /* Implementation specific max size */

typedef unsigned short sa_family_t;

/*
 * FIXME: Should come from system headers
 *
 * The definition uses an anonymous union and struct in order to control the
 * default alignment.
 */
struct sockaddr_storage {
	union {
		struct {
			/* address family */
			sa_family_t ss_family;
			/*
			 * Following field(s) are implementation specific:
			 * space to achieve the desired size, i.e. the
			 * SS_MAXSIZE value minus the size of ss_family.
			 */
			char __data[SS_MAXSIZE - sizeof(sa_family_t)];
		};
		/* implementation specific desired alignment */
		void *__align;
	};
};
107
108struct augmented_args_payload {
109 struct syscall_enter_args args;
110 union {
111 struct {
112 struct augmented_arg arg, arg2;
113 };
114 struct sockaddr_storage saddr;
115 char __data[sizeof(struct augmented_arg)];
116 };
117};
118
119// We need more tmp space than the BPF stack can give us
120struct augmented_args_tmp {
121 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
122 __type(key, int);
123 __type(value, struct augmented_args_payload);
124 __uint(max_entries, 1);
125} augmented_args_tmp SEC(".maps");
126
127static inline struct augmented_args_payload *augmented_args_payload(void)
128{
129 int key = 0;
130 return bpf_map_lookup_elem(&augmented_args_tmp, &key);
131}
132
133static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
134{
135 /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
136 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
137}
138
139static inline
140unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
141{
142 unsigned int augmented_len = sizeof(*augmented_arg);
143 int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg);
144
145 augmented_arg->size = augmented_arg->err = 0;
146 /*
147 * probe_read_str may return < 0, e.g. -EFAULT
148 * So we leave that in the augmented_arg->size that userspace will
149 */
150 if (string_len > 0) {
151 augmented_len -= sizeof(augmented_arg->value) - string_len;
152 _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two");
153 augmented_len &= sizeof(augmented_arg->value) - 1;
154 augmented_arg->size = string_len;
155 } else {
156 /*
157 * So that username notice the error while still being able
158 * to skip this augmented arg record
159 */
160 augmented_arg->err = string_len;
161 augmented_len = offsetof(struct augmented_arg, value);
162 }
163
164 return augmented_len;
165}
166
167SEC("tp/raw_syscalls/sys_enter")
168int syscall_unaugmented(struct syscall_enter_args *args)
169{
170 return 1;
171}
172
/*
 * These will be tail called from the raw_syscalls:sys_enter program, so they
 * will find in augmented_args_tmp what was read by that program and go on
 * from there, augmenting it with the contents of the pointer arguments, e.g.
 * reading the first syscall arg as a string, i.e. open's filename.
 */
179SEC("tp/syscalls/sys_enter_connect")
180int sys_enter_connect(struct syscall_enter_args *args)
181{
182 struct augmented_args_payload *augmented_args = augmented_args_payload();
183 const void *sockaddr_arg = (const void *)args->args[1];
184 unsigned int socklen = args->args[2];
185 unsigned int len = sizeof(augmented_args->args);
186
187 if (augmented_args == NULL)
188 return 1; /* Failure: don't filter */
189
190 _Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two");
191 socklen &= sizeof(augmented_args->saddr) - 1;
192
193 bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg);
194
195 return augmented__output(args, augmented_args, len + socklen);
196}
197
198SEC("tp/syscalls/sys_enter_sendto")
199int sys_enter_sendto(struct syscall_enter_args *args)
200{
201 struct augmented_args_payload *augmented_args = augmented_args_payload();
202 const void *sockaddr_arg = (const void *)args->args[4];
203 unsigned int socklen = args->args[5];
204 unsigned int len = sizeof(augmented_args->args);
205
206 if (augmented_args == NULL)
207 return 1; /* Failure: don't filter */
208
209 socklen &= sizeof(augmented_args->saddr) - 1;
210
211 bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg);
212
213 return augmented__output(args, augmented_args, len + socklen);
214}
215
216SEC("tp/syscalls/sys_enter_open")
217int sys_enter_open(struct syscall_enter_args *args)
218{
219 struct augmented_args_payload *augmented_args = augmented_args_payload();
220 const void *filename_arg = (const void *)args->args[0];
221 unsigned int len = sizeof(augmented_args->args);
222
223 if (augmented_args == NULL)
224 return 1; /* Failure: don't filter */
225
226 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
227
228 return augmented__output(args, augmented_args, len);
229}
230
231SEC("tp/syscalls/sys_enter_openat")
232int sys_enter_openat(struct syscall_enter_args *args)
233{
234 struct augmented_args_payload *augmented_args = augmented_args_payload();
235 const void *filename_arg = (const void *)args->args[1];
236 unsigned int len = sizeof(augmented_args->args);
237
238 if (augmented_args == NULL)
239 return 1; /* Failure: don't filter */
240
241 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
242
243 return augmented__output(args, augmented_args, len);
244}
245
246SEC("tp/syscalls/sys_enter_rename")
247int sys_enter_rename(struct syscall_enter_args *args)
248{
249 struct augmented_args_payload *augmented_args = augmented_args_payload();
250 const void *oldpath_arg = (const void *)args->args[0],
251 *newpath_arg = (const void *)args->args[1];
252 unsigned int len = sizeof(augmented_args->args), oldpath_len;
253
254 if (augmented_args == NULL)
255 return 1; /* Failure: don't filter */
256
257 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
258 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
259
260 return augmented__output(args, augmented_args, len);
261}
262
263SEC("tp/syscalls/sys_enter_renameat")
264int sys_enter_renameat(struct syscall_enter_args *args)
265{
266 struct augmented_args_payload *augmented_args = augmented_args_payload();
267 const void *oldpath_arg = (const void *)args->args[1],
268 *newpath_arg = (const void *)args->args[3];
269 unsigned int len = sizeof(augmented_args->args), oldpath_len;
270
271 if (augmented_args == NULL)
272 return 1; /* Failure: don't filter */
273
274 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
275 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
276
277 return augmented__output(args, augmented_args, len);
278}
279
280#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
281
282// we need just the start, get the size to then copy it
283struct perf_event_attr_size {
284 __u32 type;
285 /*
286 * Size of the attr structure, for fwd/bwd compat.
287 */
288 __u32 size;
289};
290
291SEC("tp/syscalls/sys_enter_perf_event_open")
292int sys_enter_perf_event_open(struct syscall_enter_args *args)
293{
294 struct augmented_args_payload *augmented_args = augmented_args_payload();
295 const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
296 unsigned int len = sizeof(augmented_args->args);
297
298 if (augmented_args == NULL)
299 goto failure;
300
301 if (bpf_probe_read_user(&augmented_args->__data, sizeof(*attr), attr) < 0)
302 goto failure;
303
304 attr_read = (const struct perf_event_attr_size *)augmented_args->__data;
305
306 __u32 size = attr_read->size;
307
308 if (!size)
309 size = PERF_ATTR_SIZE_VER0;
310
311 if (size > sizeof(augmented_args->__data))
312 goto failure;
313
314 // Now that we read attr->size and tested it against the size limits, read it completely
315 if (bpf_probe_read_user(&augmented_args->__data, size, attr) < 0)
316 goto failure;
317
318 return augmented__output(args, augmented_args, len + size);
319failure:
320 return 1; /* Failure: don't filter */
321}
322
323SEC("tp/syscalls/sys_enter_clock_nanosleep")
324int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
325{
326 struct augmented_args_payload *augmented_args = augmented_args_payload();
327 const void *rqtp_arg = (const void *)args->args[2];
328 unsigned int len = sizeof(augmented_args->args);
329 __u32 size = sizeof(struct timespec64);
330
331 if (augmented_args == NULL)
332 goto failure;
333
334 if (size > sizeof(augmented_args->__data))
335 goto failure;
336
337 bpf_probe_read_user(&augmented_args->__data, size, rqtp_arg);
338
339 return augmented__output(args, augmented_args, len + size);
340failure:
341 return 1; /* Failure: don't filter */
342}
343
344static pid_t getpid(void)
345{
346 return bpf_get_current_pid_tgid();
347}
348
349static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
350{
351 return bpf_map_lookup_elem(pids, &pid) != NULL;
352}
353
354SEC("tp/raw_syscalls/sys_enter")
355int sys_enter(struct syscall_enter_args *args)
356{
357 struct augmented_args_payload *augmented_args;
358 /*
359 * We start len, the amount of data that will be in the perf ring
360 * buffer, if this is not filtered out by one of pid_filter__has(),
361 * syscall->enabled, etc, with the non-augmented raw syscall payload,
362 * i.e. sizeof(augmented_args->args).
363 *
364 * We'll add to this as we add augmented syscalls right after that
365 * initial, non-augmented raw_syscalls:sys_enter payload.
366 */
367
368 if (pid_filter__has(&pids_filtered, getpid()))
369 return 0;
370
371 augmented_args = augmented_args_payload();
372 if (augmented_args == NULL)
373 return 1;
374
375 bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args);
376
377 /*
378 * Jump to syscall specific augmenter, even if the default one,
379 * "!raw_syscalls:unaugmented" that will just return 1 to return the
380 * unaugmented tracepoint payload.
381 */
382 bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
383
384 // If not found on the PROG_ARRAY syscalls map, then we're filtering it:
385 return 0;
386}
387
388SEC("tp/raw_syscalls/sys_exit")
389int sys_exit(struct syscall_exit_args *args)
390{
391 struct syscall_exit_args exit_args;
392
393 if (pid_filter__has(&pids_filtered, getpid()))
394 return 0;
395
396 bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
397 /*
398 * Jump to syscall specific return augmenter, even if the default one,
399 * "!raw_syscalls:unaugmented" that will just return 1 to return the
400 * unaugmented tracepoint payload.
401 */
402 bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
403 /*
404 * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
405 */
406 return 0;
407}
408
409char _license[] SEC("license") = "GPL";