// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Facebook
3
4#include <stdint.h>
5#include <stddef.h>
6#include <stdbool.h>
7#include <linux/bpf.h>
8#include <linux/ptrace.h>
9#include <linux/sched.h>
10#include <linux/types.h>
11#include <bpf/bpf_helpers.h>
12
13#include "bpf_compiler.h"
14
/* Local stand-ins: this file declares its own pid_t and an empty task_struct. */
typedef uint32_t pid_t;
struct task_struct {};

#define TASK_COMM_LEN 16
#define PERF_MAX_STACK_DEPTH 127

/* Metavariable kinds. */
#define STROBE_TYPE_INVALID 0
#define STROBE_TYPE_INT 1
#define STROBE_TYPE_STR 2
#define STROBE_TYPE_MAP 3

#define STACK_TABLE_EPOCH_SHIFT 20
#define STROBE_MAX_STR_LEN 1
#define STROBE_MAX_CFGS 32
/*
 * NOTE: STROBE_MAX_INTS, STROBE_MAX_STRS, STROBE_MAX_MAPS and
 * STROBE_MAX_MAP_ENTRIES are used below and throughout the file but are not
 * defined in it; presumably the build or the including file supplies them.
 */
/* Upper bound on payload bytes one map can take: tag + key/value per entry. */
#define READ_MAP_VAR_PAYLOAD_CAP \
	((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
/* Upper bound on payload bytes for all string and map variables together. */
#define STROBE_MAX_PAYLOAD \
	(STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \
	 STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP)
34
/* Fixed 8-byte prefix shared by every strobe value variant below. */
struct strobe_value_header {
	/*
	 * meaning depends on type:
	 * 1. int: 0, if value not set, 1 otherwise
	 * 2. str: 1 always, whether value is set or not is determined by ptr
	 * 3. map: 1 always, pointer points to additional struct with number
	 *    of entries (up to STROBE_MAX_MAP_ENTRIES)
	 */
	uint16_t len;
	/*
	 * _reserved might be used for some future fields/flags, but we always
	 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16
	 * bytes in one go and get both header and value
	 */
	uint8_t _reserved[6];
};

/*
 * strobe_value_generic is used from BPF probe only, but needs to be a union
 * of strobe_value_int/strobe_value_str/strobe_value_map
 */
struct strobe_value_generic {
	struct strobe_value_header header;
	union {
		int64_t val;
		void *ptr;
	};
};

/* Integer metavariable: header followed by the 64-bit value. */
struct strobe_value_int {
	struct strobe_value_header header;
	int64_t value;
};

/* String metavariable: header followed by a pointer to a C string. */
struct strobe_value_str {
	struct strobe_value_header header;
	const char *value;
};

/* Map metavariable: header followed by a pointer to the raw map. */
struct strobe_value_map {
	struct strobe_value_header header;
	const struct strobe_map_raw *value;
};

/* One key/value pair of a map; both members point to C strings. */
struct strobe_map_entry {
	const char *key;
	const char *val;
};
83
84/*
85 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
86 * corresponding int64 ID, which application can use (or ignore) in whatever
87 * way appropriate. Map is "write-only", there is no way to get data out of
88 * map. Map is intended to be used to provide metadata for profilers and is
89 * not to be used for internal in-app communication. All methods are
90 * thread-safe.
91 */
92struct strobe_map_raw {
93 /*
94 * general purpose unique ID that's up to application to decide
95 * whether and how to use; for request metadata use case id is unique
96 * request ID that's used to match metadata with stack traces on
97 * Strobelight backend side
98 */
99 int64_t id;
100 /* number of used entries in map */
101 int64_t cnt;
102 /*
103 * having volatile doesn't change anything on BPF side, but clang
104 * emits warnings for passing `volatile const char *` into
105 * bpf_probe_read_user_str that expects just `const char *`
106 */
107 const char* tag;
108 /*
109 * key/value entries, each consisting of 2 pointers to key and value
110 * C strings
111 */
112 struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
113};
114
/* Following values define supported values of TLS mode */
#define TLS_NOT_SET (-1)	/* parenthesized so it expands safely anywhere */
#define TLS_LOCAL_EXEC 0
#define TLS_IMM_EXEC 1
#define TLS_GENERAL_DYN 2

/*
 * structure that universally represents TLS location (both for static
 * executables and shared libraries)
 */
struct strobe_value_loc {
	/*
	 * tls_mode defines what TLS mode was used for particular metavariable:
	 * - -1 (TLS_NOT_SET) - no metavariable;
	 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
	 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
	 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
	 * Local Dynamic mode is not yet supported, because never seen in
	 * practice. Mode defines how offset field is interpreted. See
	 * calc_location() below for details.
	 */
	int64_t tls_mode;
	/*
	 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64,
	 *                 tpidr_el0 for aarch64).
	 * TLS_IMM_EXEC: absolute address of GOT entry containing offset
	 *               from thread pointer;
	 * TLS_GENERAL_DYN: absolute address of double GOT entry
	 *                  containing tls_index_t struct;
	 */
	int64_t offset;
};
147
148struct strobemeta_cfg {
149 int64_t req_meta_idx;
150 struct strobe_value_loc int_locs[STROBE_MAX_INTS];
151 struct strobe_value_loc str_locs[STROBE_MAX_STRS];
152 struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
153};
154
155struct strobe_map_descr {
156 uint64_t id;
157 int16_t tag_len;
158 /*
159 * cnt <0 - map value isn't set;
160 * 0 - map has id set, but no key/value entries
161 */
162 int16_t cnt;
163 /*
164 * both key_lens[i] and val_lens[i] should be >0 for present key/value
165 * entry
166 */
167 uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
168 uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
169};
170
171struct strobemeta_payload {
172 /* req_id has valid request ID, if req_meta_valid == 1 */
173 int64_t req_id;
174 uint8_t req_meta_valid;
175 /*
176 * mask has Nth bit set to 1, if Nth metavar was present and
177 * successfully read
178 */
179 uint64_t int_vals_set_mask;
180 int64_t int_vals[STROBE_MAX_INTS];
181 /* len is >0 for present values */
182 uint16_t str_lens[STROBE_MAX_STRS];
183 /* if map_descrs[i].cnt == -1, metavar is not present/set */
184 struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
185 /*
186 * payload has compactly packed values of str and map variables in the
187 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
188 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
189 * value length
190 */
191 char payload[STROBE_MAX_PAYLOAD];
192};
193
194struct strobelight_bpf_sample {
195 uint64_t ktime;
196 char comm[TASK_COMM_LEN];
197 pid_t pid;
198 int user_stack_id;
199 int kernel_stack_id;
200 int has_meta;
201 struct strobemeta_payload metadata;
202 /*
203 * makes it possible to pass (<real payload size> + 1) as data size to
204 * perf_submit() to avoid perf_submit's paranoia about passing zero as
205 * size, as it deduces that <real payload size> might be
206 * **theoretically** zero
207 */
208 char dummy_safeguard;
209};
210
211struct {
212 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
213 __uint(max_entries, 32);
214 __uint(key_size, sizeof(int));
215 __uint(value_size, sizeof(int));
216} samples SEC(".maps");
217
218struct {
219 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
220 __uint(max_entries, 16);
221 __uint(key_size, sizeof(uint32_t));
222 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
223} stacks_0 SEC(".maps");
224
225struct {
226 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
227 __uint(max_entries, 16);
228 __uint(key_size, sizeof(uint32_t));
229 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
230} stacks_1 SEC(".maps");
231
232struct {
233 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
234 __uint(max_entries, 1);
235 __type(key, uint32_t);
236 __type(value, struct strobelight_bpf_sample);
237} sample_heap SEC(".maps");
238
239struct {
240 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
241 __uint(max_entries, STROBE_MAX_CFGS);
242 __type(key, pid_t);
243 __type(value, struct strobemeta_cfg);
244} strobemeta_cfgs SEC(".maps");
245
/* Type for the dtv. */
/* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */
typedef union dtv {
	size_t counter;
	struct {
		void *val;
		bool is_static;
	} pointer;
} dtv_t;

/* Partial definition for tcbhead_t */
/* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
struct tcbhead {
	void *tcb;
	dtv_t *dtv;
};

/*
 * TLS module/offset information for shared library case.
 * For x86-64, this is mapped onto two entries in GOT.
 * For aarch64, this is pointed to by second GOT entry.
 */
struct tls_index {
	uint64_t module;
	uint64_t offset;
};
272
273#ifdef SUBPROGS
274__noinline
275#else
276__always_inline
277#endif
278static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
279{
280 /*
281 * tls_mode value is:
282 * - -1 (TLS_NOT_SET), if no metavar is present;
283 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
284 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
285 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
286 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
287 * This schema allows to use something like:
288 * (tls_mode + 1) * (tls_base + offset)
289 * to get NULL for "no metavar" location, or correct pointer for local
290 * executable mode without doing extra ifs.
291 */
292 if (loc->tls_mode <= TLS_LOCAL_EXEC) {
293 /* static executable is simple, we just have offset from
294 * tls_base */
295 void *addr = tls_base + loc->offset;
296 /* multiply by (tls_mode + 1) to get NULL, if we have no
297 * metavar in this slot */
298 return (void *)((loc->tls_mode + 1) * (int64_t)addr);
299 }
300 /*
301 * Other modes are more complicated, we need to jump through few hoops.
302 *
303 * For immediate executable mode (currently supported only for aarch64):
304 * - loc->offset is pointing to a GOT entry containing fixed offset
305 * relative to tls_base;
306 *
307 * For general dynamic mode:
308 * - loc->offset is pointing to a beginning of double GOT entries;
309 * - (for aarch64 only) second entry points to tls_index_t struct;
310 * - (for x86-64 only) two GOT entries are already tls_index_t;
311 * - tls_index_t->module is used to find start of TLS section in
312 * which variable resides;
313 * - tls_index_t->offset provides offset within that TLS section,
314 * pointing to value of variable.
315 */
316 struct tls_index tls_index;
317 dtv_t *dtv;
318 void *tls_ptr;
319
320 bpf_probe_read_user(&tls_index, sizeof(struct tls_index),
321 (void *)loc->offset);
322 /* valid module index is always positive */
323 if (tls_index.module > 0) {
324 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
325 bpf_probe_read_user(&dtv, sizeof(dtv),
326 &((struct tcbhead *)tls_base)->dtv);
327 dtv += tls_index.module;
328 } else {
329 dtv = NULL;
330 }
331 bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
332 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
333 return tls_ptr && tls_ptr != (void *)-1
334 ? tls_ptr + tls_index.offset
335 : NULL;
336}
337
338#ifdef SUBPROGS
339__noinline
340#else
341__always_inline
342#endif
343static void read_int_var(struct strobemeta_cfg *cfg,
344 size_t idx, void *tls_base,
345 struct strobe_value_generic *value,
346 struct strobemeta_payload *data)
347{
348 void *location = calc_location(&cfg->int_locs[idx], tls_base);
349 if (!location)
350 return;
351
352 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
353 data->int_vals[idx] = value->val;
354 if (value->header.len)
355 data->int_vals_set_mask |= (1 << idx);
356}
357
358static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
359 size_t idx, void *tls_base,
360 struct strobe_value_generic *value,
361 struct strobemeta_payload *data,
362 size_t off)
363{
364 void *location;
365 uint64_t len;
366
367 data->str_lens[idx] = 0;
368 location = calc_location(&cfg->str_locs[idx], tls_base);
369 if (!location)
370 return 0;
371
372 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
373 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr);
374 /*
375 * if bpf_probe_read_user_str returns error (<0), due to casting to
376 * unsigned int, it will become big number, so next check is
377 * sufficient to check for errors AND prove to BPF verifier, that
378 * bpf_probe_read_user_str won't return anything bigger than
379 * STROBE_MAX_STR_LEN
380 */
381 if (len > STROBE_MAX_STR_LEN)
382 return 0;
383
384 data->str_lens[idx] = len;
385 return off + len;
386}
387
388static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg,
389 size_t idx, void *tls_base,
390 struct strobe_value_generic *value,
391 struct strobemeta_payload *data,
392 size_t off)
393{
394 struct strobe_map_descr* descr = &data->map_descrs[idx];
395 struct strobe_map_raw map;
396 void *location;
397 uint64_t len;
398
399 descr->tag_len = 0; /* presume no tag is set */
400 descr->cnt = -1; /* presume no value is set */
401
402 location = calc_location(&cfg->map_locs[idx], tls_base);
403 if (!location)
404 return off;
405
406 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
407 if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
408 return off;
409
410 descr->id = map.id;
411 descr->cnt = map.cnt;
412 if (cfg->req_meta_idx == idx) {
413 data->req_id = map.id;
414 data->req_meta_valid = 1;
415 }
416
417 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag);
418 if (len <= STROBE_MAX_STR_LEN) {
419 descr->tag_len = len;
420 off += len;
421 }
422
423#ifdef NO_UNROLL
424 __pragma_loop_no_unroll
425#else
426 __pragma_loop_unroll
427#endif
428 for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
429 if (i >= map.cnt)
430 break;
431
432 descr->key_lens[i] = 0;
433 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
434 map.entries[i].key);
435 if (len <= STROBE_MAX_STR_LEN) {
436 descr->key_lens[i] = len;
437 off += len;
438 }
439 descr->val_lens[i] = 0;
440 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
441 map.entries[i].val);
442 if (len <= STROBE_MAX_STR_LEN) {
443 descr->val_lens[i] = len;
444 off += len;
445 }
446 }
447
448 return off;
449}
450
#ifdef USE_BPF_LOOP
/* Selects which reader read_var_callback() dispatches to. */
enum read_type {
	READ_INT_VAR,
	READ_MAP_VAR,
	READ_STR_VAR,
};

/* State passed to read_var_callback() on every iteration. */
struct read_var_ctx {
	struct strobemeta_payload *data;
	void *tls_base;
	struct strobemeta_cfg *cfg;
	/* running offset into data->payload, updated by str/map reads */
	size_t payload_off;
	/* value gets mutated */
	struct strobe_value_generic *value;
	enum read_type type;
};

/*
 * Loop callback: reads variable number `index` of kind ctx->type.
 *
 * Returns 0 after processing the variable, or 1 without reading anything
 * when index is out of range for the kind or when the remaining payload
 * space is smaller than the worst case for one more str/map variable.
 */
static int read_var_callback(__u64 index, struct read_var_ctx *ctx)
{
	/* lose precision info for ctx->payload_off, verifier won't track
	 * double xor, barrier_var() is needed to force clang keep both xors.
	 */
	ctx->payload_off ^= index;
	barrier_var(ctx->payload_off);
	ctx->payload_off ^= index;
	switch (ctx->type) {
	case READ_INT_VAR:
		if (index >= STROBE_MAX_INTS)
			return 1;
		read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data);
		break;
	case READ_MAP_VAR:
		if (index >= STROBE_MAX_MAPS)
			return 1;
		if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP)
			return 1;
		ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base,
						ctx->value, ctx->data, ctx->payload_off);
		break;
	case READ_STR_VAR:
		if (index >= STROBE_MAX_STRS)
			return 1;
		if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN)
			return 1;
		ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base,
						ctx->value, ctx->data, ctx->payload_off);
		break;
	}
	return 0;
}
#endif /* USE_BPF_LOOP */
502
503/*
504 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
505 * pointer to *right after* payload ends
506 */
507#ifdef SUBPROGS
508__noinline
509#else
510__always_inline
511#endif
512static void *read_strobe_meta(struct task_struct *task,
513 struct strobemeta_payload *data)
514{
515 pid_t pid = bpf_get_current_pid_tgid() >> 32;
516 struct strobe_value_generic value = {0};
517 struct strobemeta_cfg *cfg;
518 size_t payload_off;
519 void *tls_base;
520
521 cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
522 if (!cfg)
523 return NULL;
524
525 data->int_vals_set_mask = 0;
526 data->req_meta_valid = 0;
527 payload_off = 0;
528 /*
529 * we don't have struct task_struct definition, it should be:
530 * tls_base = (void *)task->thread.fsbase;
531 */
532 tls_base = (void *)task;
533
534#ifdef USE_BPF_LOOP
535 struct read_var_ctx ctx = {
536 .cfg = cfg,
537 .tls_base = tls_base,
538 .value = &value,
539 .data = data,
540 .payload_off = 0,
541 };
542 int err;
543
544 ctx.type = READ_INT_VAR;
545 err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0);
546 if (err != STROBE_MAX_INTS)
547 return NULL;
548
549 ctx.type = READ_STR_VAR;
550 err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0);
551 if (err != STROBE_MAX_STRS)
552 return NULL;
553
554 ctx.type = READ_MAP_VAR;
555 err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
556 if (err != STROBE_MAX_MAPS)
557 return NULL;
558
559 payload_off = ctx.payload_off;
560 /* this should not really happen, here only to satisfy verifier */
561 if (payload_off > sizeof(data->payload))
562 payload_off = sizeof(data->payload);
563#else
564#ifdef NO_UNROLL
565 __pragma_loop_no_unroll
566#else
567 __pragma_loop_unroll
568#endif /* NO_UNROLL */
569 for (int i = 0; i < STROBE_MAX_INTS; ++i) {
570 read_int_var(cfg, i, tls_base, &value, data);
571 }
572#ifdef NO_UNROLL
573 __pragma_loop_no_unroll
574#else
575 __pragma_loop_unroll
576#endif /* NO_UNROLL */
577 for (int i = 0; i < STROBE_MAX_STRS; ++i) {
578 payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off);
579 }
580#ifdef NO_UNROLL
581 __pragma_loop_no_unroll
582#else
583 __pragma_loop_unroll
584#endif /* NO_UNROLL */
585 for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
586 payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off);
587 }
588#endif /* USE_BPF_LOOP */
589
590 /*
591 * return pointer right after end of payload, so it's possible to
592 * calculate exact amount of useful data that needs to be sent
593 */
594 return &data->payload[payload_off];
595}
596
597SEC("raw_tracepoint/kfree_skb")
598int on_event(struct pt_regs *ctx) {
599 pid_t pid = bpf_get_current_pid_tgid() >> 32;
600 struct strobelight_bpf_sample* sample;
601 struct task_struct *task;
602 uint32_t zero = 0;
603 uint64_t ktime_ns;
604 void *sample_end;
605
606 sample = bpf_map_lookup_elem(&sample_heap, &zero);
607 if (!sample)
608 return 0; /* this will never happen */
609
610 sample->pid = pid;
611 bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
612 ktime_ns = bpf_ktime_get_ns();
613 sample->ktime = ktime_ns;
614
615 task = (struct task_struct *)bpf_get_current_task();
616 sample_end = read_strobe_meta(task, &sample->metadata);
617 sample->has_meta = sample_end != NULL;
618 sample_end = sample_end ? : &sample->metadata;
619
620 if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
621 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
622 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
623 } else {
624 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
625 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
626 }
627
628 uint64_t sample_size = sample_end - (void *)sample;
629 /* should always be true */
630 if (sample_size < sizeof(struct strobelight_bpf_sample))
631 bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
632 return 0;
633}
634
635char _license[] SEC("license") = "GPL";
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Facebook
3
4#include <stdint.h>
5#include <stddef.h>
6#include <stdbool.h>
7#include <linux/bpf.h>
8#include <linux/ptrace.h>
9#include <linux/sched.h>
10#include <linux/types.h>
11#include <bpf/bpf_helpers.h>
12
/* Local stand-ins: this file declares its own pid_t and an empty task_struct. */
typedef uint32_t pid_t;
struct task_struct {};

#define TASK_COMM_LEN 16
#define PERF_MAX_STACK_DEPTH 127

/* Metavariable kinds. */
#define STROBE_TYPE_INVALID 0
#define STROBE_TYPE_INT 1
#define STROBE_TYPE_STR 2
#define STROBE_TYPE_MAP 3

#define STACK_TABLE_EPOCH_SHIFT 20
#define STROBE_MAX_STR_LEN 1
#define STROBE_MAX_CFGS 32
/*
 * NOTE: STROBE_MAX_INTS, STROBE_MAX_STRS, STROBE_MAX_MAPS and
 * STROBE_MAX_MAP_ENTRIES are used below and throughout the file but are not
 * defined in it; presumably the build or the including file supplies them.
 */
/* Upper bound on payload bytes one map can take: tag + key/value per entry. */
#define READ_MAP_VAR_PAYLOAD_CAP \
	((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
/* Upper bound on payload bytes for all string and map variables together. */
#define STROBE_MAX_PAYLOAD \
	(STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \
	 STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP)
32
/* Fixed 8-byte prefix shared by every strobe value variant below. */
struct strobe_value_header {
	/*
	 * meaning depends on type:
	 * 1. int: 0, if value not set, 1 otherwise
	 * 2. str: 1 always, whether value is set or not is determined by ptr
	 * 3. map: 1 always, pointer points to additional struct with number
	 *    of entries (up to STROBE_MAX_MAP_ENTRIES)
	 */
	uint16_t len;
	/*
	 * _reserved might be used for some future fields/flags, but we always
	 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16
	 * bytes in one go and get both header and value
	 */
	uint8_t _reserved[6];
};

/*
 * strobe_value_generic is used from BPF probe only, but needs to be a union
 * of strobe_value_int/strobe_value_str/strobe_value_map
 */
struct strobe_value_generic {
	struct strobe_value_header header;
	union {
		int64_t val;
		void *ptr;
	};
};

/* Integer metavariable: header followed by the 64-bit value. */
struct strobe_value_int {
	struct strobe_value_header header;
	int64_t value;
};

/* String metavariable: header followed by a pointer to a C string. */
struct strobe_value_str {
	struct strobe_value_header header;
	const char *value;
};

/* Map metavariable: header followed by a pointer to the raw map. */
struct strobe_value_map {
	struct strobe_value_header header;
	const struct strobe_map_raw *value;
};

/* One key/value pair of a map; both members point to C strings. */
struct strobe_map_entry {
	const char *key;
	const char *val;
};
81
82/*
83 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
84 * corresponding int64 ID, which application can use (or ignore) in whatever
85 * way appropriate. Map is "write-only", there is no way to get data out of
86 * map. Map is intended to be used to provide metadata for profilers and is
87 * not to be used for internal in-app communication. All methods are
88 * thread-safe.
89 */
90struct strobe_map_raw {
91 /*
92 * general purpose unique ID that's up to application to decide
93 * whether and how to use; for request metadata use case id is unique
94 * request ID that's used to match metadata with stack traces on
95 * Strobelight backend side
96 */
97 int64_t id;
98 /* number of used entries in map */
99 int64_t cnt;
100 /*
101 * having volatile doesn't change anything on BPF side, but clang
102 * emits warnings for passing `volatile const char *` into
103 * bpf_probe_read_user_str that expects just `const char *`
104 */
105 const char* tag;
106 /*
107 * key/value entries, each consisting of 2 pointers to key and value
108 * C strings
109 */
110 struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
111};
112
/* Following values define supported values of TLS mode */
#define TLS_NOT_SET (-1)	/* parenthesized so it expands safely anywhere */
#define TLS_LOCAL_EXEC 0
#define TLS_IMM_EXEC 1
#define TLS_GENERAL_DYN 2

/*
 * structure that universally represents TLS location (both for static
 * executables and shared libraries)
 */
struct strobe_value_loc {
	/*
	 * tls_mode defines what TLS mode was used for particular metavariable:
	 * - -1 (TLS_NOT_SET) - no metavariable;
	 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
	 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
	 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
	 * Local Dynamic mode is not yet supported, because never seen in
	 * practice. Mode defines how offset field is interpreted. See
	 * calc_location() below for details.
	 */
	int64_t tls_mode;
	/*
	 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64,
	 *                 tpidr_el0 for aarch64).
	 * TLS_IMM_EXEC: absolute address of GOT entry containing offset
	 *               from thread pointer;
	 * TLS_GENERAL_DYN: absolute address of double GOT entry
	 *                  containing tls_index_t struct;
	 */
	int64_t offset;
};
145
146struct strobemeta_cfg {
147 int64_t req_meta_idx;
148 struct strobe_value_loc int_locs[STROBE_MAX_INTS];
149 struct strobe_value_loc str_locs[STROBE_MAX_STRS];
150 struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
151};
152
153struct strobe_map_descr {
154 uint64_t id;
155 int16_t tag_len;
156 /*
157 * cnt <0 - map value isn't set;
158 * 0 - map has id set, but no key/value entries
159 */
160 int16_t cnt;
161 /*
162 * both key_lens[i] and val_lens[i] should be >0 for present key/value
163 * entry
164 */
165 uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
166 uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
167};
168
169struct strobemeta_payload {
170 /* req_id has valid request ID, if req_meta_valid == 1 */
171 int64_t req_id;
172 uint8_t req_meta_valid;
173 /*
174 * mask has Nth bit set to 1, if Nth metavar was present and
175 * successfully read
176 */
177 uint64_t int_vals_set_mask;
178 int64_t int_vals[STROBE_MAX_INTS];
179 /* len is >0 for present values */
180 uint16_t str_lens[STROBE_MAX_STRS];
181 /* if map_descrs[i].cnt == -1, metavar is not present/set */
182 struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
183 /*
184 * payload has compactly packed values of str and map variables in the
185 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
186 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
187 * value length
188 */
189 char payload[STROBE_MAX_PAYLOAD];
190};
191
192struct strobelight_bpf_sample {
193 uint64_t ktime;
194 char comm[TASK_COMM_LEN];
195 pid_t pid;
196 int user_stack_id;
197 int kernel_stack_id;
198 int has_meta;
199 struct strobemeta_payload metadata;
200 /*
201 * makes it possible to pass (<real payload size> + 1) as data size to
202 * perf_submit() to avoid perf_submit's paranoia about passing zero as
203 * size, as it deduces that <real payload size> might be
204 * **theoretically** zero
205 */
206 char dummy_safeguard;
207};
208
209struct {
210 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
211 __uint(max_entries, 32);
212 __uint(key_size, sizeof(int));
213 __uint(value_size, sizeof(int));
214} samples SEC(".maps");
215
216struct {
217 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
218 __uint(max_entries, 16);
219 __uint(key_size, sizeof(uint32_t));
220 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
221} stacks_0 SEC(".maps");
222
223struct {
224 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
225 __uint(max_entries, 16);
226 __uint(key_size, sizeof(uint32_t));
227 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
228} stacks_1 SEC(".maps");
229
230struct {
231 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
232 __uint(max_entries, 1);
233 __type(key, uint32_t);
234 __type(value, struct strobelight_bpf_sample);
235} sample_heap SEC(".maps");
236
237struct {
238 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
239 __uint(max_entries, STROBE_MAX_CFGS);
240 __type(key, pid_t);
241 __type(value, struct strobemeta_cfg);
242} strobemeta_cfgs SEC(".maps");
243
/* Type for the dtv. */
/* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */
typedef union dtv {
	size_t counter;
	struct {
		void *val;
		bool is_static;
	} pointer;
} dtv_t;

/* Partial definition for tcbhead_t */
/* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
struct tcbhead {
	void *tcb;
	dtv_t *dtv;
};

/*
 * TLS module/offset information for shared library case.
 * For x86-64, this is mapped onto two entries in GOT.
 * For aarch64, this is pointed to by second GOT entry.
 */
struct tls_index {
	uint64_t module;
	uint64_t offset;
};
270
271#ifdef SUBPROGS
272__noinline
273#else
274__always_inline
275#endif
276static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
277{
278 /*
279 * tls_mode value is:
280 * - -1 (TLS_NOT_SET), if no metavar is present;
281 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
282 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
283 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
284 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
285 * This schema allows to use something like:
286 * (tls_mode + 1) * (tls_base + offset)
287 * to get NULL for "no metavar" location, or correct pointer for local
288 * executable mode without doing extra ifs.
289 */
290 if (loc->tls_mode <= TLS_LOCAL_EXEC) {
291 /* static executable is simple, we just have offset from
292 * tls_base */
293 void *addr = tls_base + loc->offset;
294 /* multiply by (tls_mode + 1) to get NULL, if we have no
295 * metavar in this slot */
296 return (void *)((loc->tls_mode + 1) * (int64_t)addr);
297 }
298 /*
299 * Other modes are more complicated, we need to jump through few hoops.
300 *
301 * For immediate executable mode (currently supported only for aarch64):
302 * - loc->offset is pointing to a GOT entry containing fixed offset
303 * relative to tls_base;
304 *
305 * For general dynamic mode:
306 * - loc->offset is pointing to a beginning of double GOT entries;
307 * - (for aarch64 only) second entry points to tls_index_t struct;
308 * - (for x86-64 only) two GOT entries are already tls_index_t;
309 * - tls_index_t->module is used to find start of TLS section in
310 * which variable resides;
311 * - tls_index_t->offset provides offset within that TLS section,
312 * pointing to value of variable.
313 */
314 struct tls_index tls_index;
315 dtv_t *dtv;
316 void *tls_ptr;
317
318 bpf_probe_read_user(&tls_index, sizeof(struct tls_index),
319 (void *)loc->offset);
320 /* valid module index is always positive */
321 if (tls_index.module > 0) {
322 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
323 bpf_probe_read_user(&dtv, sizeof(dtv),
324 &((struct tcbhead *)tls_base)->dtv);
325 dtv += tls_index.module;
326 } else {
327 dtv = NULL;
328 }
329 bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
330 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
331 return tls_ptr && tls_ptr != (void *)-1
332 ? tls_ptr + tls_index.offset
333 : NULL;
334}
335
336#ifdef SUBPROGS
337__noinline
338#else
339__always_inline
340#endif
341static void read_int_var(struct strobemeta_cfg *cfg,
342 size_t idx, void *tls_base,
343 struct strobe_value_generic *value,
344 struct strobemeta_payload *data)
345{
346 void *location = calc_location(&cfg->int_locs[idx], tls_base);
347 if (!location)
348 return;
349
350 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
351 data->int_vals[idx] = value->val;
352 if (value->header.len)
353 data->int_vals_set_mask |= (1 << idx);
354}
355
356static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
357 size_t idx, void *tls_base,
358 struct strobe_value_generic *value,
359 struct strobemeta_payload *data,
360 size_t off)
361{
362 void *location;
363 uint64_t len;
364
365 data->str_lens[idx] = 0;
366 location = calc_location(&cfg->str_locs[idx], tls_base);
367 if (!location)
368 return 0;
369
370 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
371 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr);
372 /*
373 * if bpf_probe_read_user_str returns error (<0), due to casting to
374 * unsinged int, it will become big number, so next check is
375 * sufficient to check for errors AND prove to BPF verifier, that
376 * bpf_probe_read_user_str won't return anything bigger than
377 * STROBE_MAX_STR_LEN
378 */
379 if (len > STROBE_MAX_STR_LEN)
380 return 0;
381
382 data->str_lens[idx] = len;
383 return off + len;
384}
385
386static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg,
387 size_t idx, void *tls_base,
388 struct strobe_value_generic *value,
389 struct strobemeta_payload *data,
390 size_t off)
391{
392 struct strobe_map_descr* descr = &data->map_descrs[idx];
393 struct strobe_map_raw map;
394 void *location;
395 uint64_t len;
396
397 descr->tag_len = 0; /* presume no tag is set */
398 descr->cnt = -1; /* presume no value is set */
399
400 location = calc_location(&cfg->map_locs[idx], tls_base);
401 if (!location)
402 return off;
403
404 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
405 if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
406 return off;
407
408 descr->id = map.id;
409 descr->cnt = map.cnt;
410 if (cfg->req_meta_idx == idx) {
411 data->req_id = map.id;
412 data->req_meta_valid = 1;
413 }
414
415 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag);
416 if (len <= STROBE_MAX_STR_LEN) {
417 descr->tag_len = len;
418 off += len;
419 }
420
421#ifdef NO_UNROLL
422#pragma clang loop unroll(disable)
423#else
424#pragma unroll
425#endif
426 for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
427 if (i >= map.cnt)
428 break;
429
430 descr->key_lens[i] = 0;
431 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
432 map.entries[i].key);
433 if (len <= STROBE_MAX_STR_LEN) {
434 descr->key_lens[i] = len;
435 off += len;
436 }
437 descr->val_lens[i] = 0;
438 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
439 map.entries[i].val);
440 if (len <= STROBE_MAX_STR_LEN) {
441 descr->val_lens[i] = len;
442 off += len;
443 }
444 }
445
446 return off;
447}
448
449#ifdef USE_BPF_LOOP
/* Kind of variable that read_var_callback() reads on each iteration. */
enum read_type {
	READ_INT_VAR = 0,	/* dispatches to read_int_var() */
	READ_MAP_VAR = 1,	/* dispatches to read_map_var() */
	READ_STR_VAR = 2,	/* dispatches to read_str_var() */
};
455
456struct read_var_ctx {
457 struct strobemeta_payload *data;
458 void *tls_base;
459 struct strobemeta_cfg *cfg;
460 size_t payload_off;
461 /* value gets mutated */
462 struct strobe_value_generic *value;
463 enum read_type type;
464};
465
466static int read_var_callback(__u64 index, struct read_var_ctx *ctx)
467{
468 /* lose precision info for ctx->payload_off, verifier won't track
469 * double xor, barrier_var() is needed to force clang keep both xors.
470 */
471 ctx->payload_off ^= index;
472 barrier_var(ctx->payload_off);
473 ctx->payload_off ^= index;
474 switch (ctx->type) {
475 case READ_INT_VAR:
476 if (index >= STROBE_MAX_INTS)
477 return 1;
478 read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data);
479 break;
480 case READ_MAP_VAR:
481 if (index >= STROBE_MAX_MAPS)
482 return 1;
483 if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP)
484 return 1;
485 ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base,
486 ctx->value, ctx->data, ctx->payload_off);
487 break;
488 case READ_STR_VAR:
489 if (index >= STROBE_MAX_STRS)
490 return 1;
491 if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN)
492 return 1;
493 ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base,
494 ctx->value, ctx->data, ctx->payload_off);
495 break;
496 }
497 return 0;
498}
499#endif /* USE_BPF_LOOP */
500
501/*
502 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
503 * pointer to *right after* payload ends
504 */
505#ifdef SUBPROGS
506__noinline
507#else
508__always_inline
509#endif
510static void *read_strobe_meta(struct task_struct *task,
511 struct strobemeta_payload *data)
512{
513 pid_t pid = bpf_get_current_pid_tgid() >> 32;
514 struct strobe_value_generic value = {0};
515 struct strobemeta_cfg *cfg;
516 size_t payload_off;
517 void *tls_base;
518
519 cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
520 if (!cfg)
521 return NULL;
522
523 data->int_vals_set_mask = 0;
524 data->req_meta_valid = 0;
525 payload_off = 0;
526 /*
527 * we don't have struct task_struct definition, it should be:
528 * tls_base = (void *)task->thread.fsbase;
529 */
530 tls_base = (void *)task;
531
532#ifdef USE_BPF_LOOP
533 struct read_var_ctx ctx = {
534 .cfg = cfg,
535 .tls_base = tls_base,
536 .value = &value,
537 .data = data,
538 .payload_off = 0,
539 };
540 int err;
541
542 ctx.type = READ_INT_VAR;
543 err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0);
544 if (err != STROBE_MAX_INTS)
545 return NULL;
546
547 ctx.type = READ_STR_VAR;
548 err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0);
549 if (err != STROBE_MAX_STRS)
550 return NULL;
551
552 ctx.type = READ_MAP_VAR;
553 err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
554 if (err != STROBE_MAX_MAPS)
555 return NULL;
556
557 payload_off = ctx.payload_off;
558 /* this should not really happen, here only to satisfy verifer */
559 if (payload_off > sizeof(data->payload))
560 payload_off = sizeof(data->payload);
561#else
562#ifdef NO_UNROLL
563#pragma clang loop unroll(disable)
564#else
565#pragma unroll
566#endif /* NO_UNROLL */
567 for (int i = 0; i < STROBE_MAX_INTS; ++i) {
568 read_int_var(cfg, i, tls_base, &value, data);
569 }
570#ifdef NO_UNROLL
571#pragma clang loop unroll(disable)
572#else
573#pragma unroll
574#endif /* NO_UNROLL */
575 for (int i = 0; i < STROBE_MAX_STRS; ++i) {
576 payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off);
577 }
578#ifdef NO_UNROLL
579#pragma clang loop unroll(disable)
580#else
581#pragma unroll
582#endif /* NO_UNROLL */
583 for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
584 payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off);
585 }
586#endif /* USE_BPF_LOOP */
587
588 /*
589 * return pointer right after end of payload, so it's possible to
590 * calculate exact amount of useful data that needs to be sent
591 */
592 return &data->payload[payload_off];
593}
594
595SEC("raw_tracepoint/kfree_skb")
596int on_event(struct pt_regs *ctx) {
597 pid_t pid = bpf_get_current_pid_tgid() >> 32;
598 struct strobelight_bpf_sample* sample;
599 struct task_struct *task;
600 uint32_t zero = 0;
601 uint64_t ktime_ns;
602 void *sample_end;
603
604 sample = bpf_map_lookup_elem(&sample_heap, &zero);
605 if (!sample)
606 return 0; /* this will never happen */
607
608 sample->pid = pid;
609 bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
610 ktime_ns = bpf_ktime_get_ns();
611 sample->ktime = ktime_ns;
612
613 task = (struct task_struct *)bpf_get_current_task();
614 sample_end = read_strobe_meta(task, &sample->metadata);
615 sample->has_meta = sample_end != NULL;
616 sample_end = sample_end ? : &sample->metadata;
617
618 if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
619 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
620 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
621 } else {
622 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
623 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
624 }
625
626 uint64_t sample_size = sample_end - (void *)sample;
627 /* should always be true */
628 if (sample_size < sizeof(struct strobelight_bpf_sample))
629 bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
630 return 0;
631}
632
/* License string placed in the "license" ELF section of the BPF object. */
char _license[] SEC("license") = "GPL";