Linux Audio

Check our new training course

Loading...
v6.13.7
  1// SPDX-License-Identifier: GPL-2.0
  2// Copyright (c) 2019 Facebook
  3
  4#include <stdint.h>
  5#include <stddef.h>
  6#include <stdbool.h>
  7#include <linux/bpf.h>
  8#include <linux/ptrace.h>
  9#include <linux/sched.h>
 10#include <linux/types.h>
 11#include <bpf/bpf_helpers.h>
 12
 13#include "bpf_compiler.h"
 14
 15typedef uint32_t pid_t;
 16struct task_struct {};
 17
 18#define TASK_COMM_LEN 16
 19#define PERF_MAX_STACK_DEPTH 127
 20
 21#define STROBE_TYPE_INVALID 0
 22#define STROBE_TYPE_INT 1
 23#define STROBE_TYPE_STR 2
 24#define STROBE_TYPE_MAP 3
 25
 26#define STACK_TABLE_EPOCH_SHIFT 20
 27#define STROBE_MAX_STR_LEN 1
 28#define STROBE_MAX_CFGS 32
 29#define READ_MAP_VAR_PAYLOAD_CAP					\
 30	((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
 31#define STROBE_MAX_PAYLOAD						\
 32	(STROBE_MAX_STRS * STROBE_MAX_STR_LEN +				\
 33	 STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP)
 34
 35struct strobe_value_header {
 36	/*
 37	 * meaning depends on type:
 38	 * 1. int: 0, if value not set, 1 otherwise
 39	 * 2. str: 1 always, whether value is set or not is determined by ptr
 40	 * 3. map: 1 always, pointer points to additional struct with number
 41	 *    of entries (up to STROBE_MAX_MAP_ENTRIES)
 42	 */
 43	uint16_t len;
 44	/*
 45	 * _reserved might be used for some future fields/flags, but we always
 46	 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16
 47	 * bytes in one go and get both header and value
 48	 */
 49	uint8_t _reserved[6];
 50};
 51
 52/*
 53 * strobe_value_generic is used from BPF probe only, but needs to be a union
 54 * of strobe_value_int/strobe_value_str/strobe_value_map
 55 */
 56struct strobe_value_generic {
 57	struct strobe_value_header header;
 58	union {
 59		int64_t val;
 60		void *ptr;
 61	};
 62};
 63
 64struct strobe_value_int {
 65	struct strobe_value_header header;
 66	int64_t value;
 67};
 68
 69struct strobe_value_str {
 70	struct strobe_value_header header;
 71	const char* value;
 72};
 73
 74struct strobe_value_map {
 75	struct strobe_value_header header;
 76	const struct strobe_map_raw* value;
 77};
 78
 79struct strobe_map_entry {
 80	const char* key;
 81	const char* val;
 82};
 83
 84/*
 85 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
 86 * corresponding int64 ID, which application can use (or ignore) in whatever
 87 * way appropriate. Map is "write-only", there is no way to get data out of
 88 * map. Map is intended to be used to provide metadata for profilers and is
 89 * not to be used for internal in-app communication. All methods are
 90 * thread-safe.
 91 */
 92struct strobe_map_raw {
 93	/*
 94	 * general purpose unique ID that's up to application to decide
 95	 * whether and how to use; for request metadata use case id is unique
 96	 * request ID that's used to match metadata with stack traces on
 97	 * Strobelight backend side
 98	 */
 99	int64_t id;
100	/* number of used entries in map */
101	int64_t cnt;
102	/*
103	 * having volatile doesn't change anything on BPF side, but clang
104	 * emits warnings for passing `volatile const char *` into
105	 * bpf_probe_read_user_str that expects just `const char *`
106	 */
107	const char* tag;
108	/*
109	 * key/value entries, each consisting of 2 pointers to key and value
110	 * C strings
111	 */
112	struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
113};
114
115/* Following values define supported values of TLS mode */
116#define TLS_NOT_SET -1
117#define TLS_LOCAL_EXEC 0
118#define TLS_IMM_EXEC 1
119#define TLS_GENERAL_DYN 2
120
/*
 * Uniform description of where a metavariable lives in TLS, valid both for
 * static executables and for shared libraries.
 */
struct strobe_value_loc {
	/*
	 * TLS access model used for this metavariable:
	 *   -1 (TLS_NOT_SET)     - slot has no metavariable;
	 *    0 (TLS_LOCAL_EXEC)  - Local Executable mode;
	 *    1 (TLS_IMM_EXEC)    - Immediate Executable mode;
	 *    2 (TLS_GENERAL_DYN) - General Dynamic mode.
	 * Local Dynamic mode is not supported yet because it has never been
	 * seen in practice. The mode decides how `offset` is interpreted;
	 * see calc_location() for the details.
	 */
	int64_t tls_mode;
	/*
	 * TLS_LOCAL_EXEC:  offset from the thread pointer (fs:0 on x86-64,
	 *                  tpidr_el0 on aarch64);
	 * TLS_IMM_EXEC:    absolute address of the GOT entry that holds the
	 *                  offset from the thread pointer;
	 * TLS_GENERAL_DYN: absolute address of the double GOT entry that
	 *                  holds a tls_index_t struct.
	 */
	int64_t offset;
};
147
148struct strobemeta_cfg {
149	int64_t req_meta_idx;
150	struct strobe_value_loc int_locs[STROBE_MAX_INTS];
151	struct strobe_value_loc str_locs[STROBE_MAX_STRS];
152	struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
153};
154
155struct strobe_map_descr {
156	uint64_t id;
157	int16_t tag_len;
158	/*
159	 * cnt <0 - map value isn't set;
160	 * 0 - map has id set, but no key/value entries
161	 */
162	int16_t cnt;
163	/*
164	 * both key_lens[i] and val_lens[i] should be >0 for present key/value
165	 * entry
166	 */
167	uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
168	uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
169};
170
171struct strobemeta_payload {
172	/* req_id has valid request ID, if req_meta_valid == 1 */
173	int64_t req_id;
174	uint8_t req_meta_valid;
175	/*
176	 * mask has Nth bit set to 1, if Nth metavar was present and
177	 * successfully read
178	 */
179	uint64_t int_vals_set_mask;
180	int64_t int_vals[STROBE_MAX_INTS];
181	/* len is >0 for present values */
182	uint16_t str_lens[STROBE_MAX_STRS];
183	/* if map_descrs[i].cnt == -1, metavar is not present/set */
184	struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
185	/*
186	 * payload has compactly packed values of str and map variables in the
187	 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
188	 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
189	 * value length
190	 */
191	char payload[STROBE_MAX_PAYLOAD];
192};
193
194struct strobelight_bpf_sample {
195	uint64_t ktime;
196	char comm[TASK_COMM_LEN];
197	pid_t pid;
198	int user_stack_id;
199	int kernel_stack_id;
200	int has_meta;
201	struct strobemeta_payload metadata;
202	/*
203	 * makes it possible to pass (<real payload size> + 1) as data size to
204	 * perf_submit() to avoid perf_submit's paranoia about passing zero as
205	 * size, as it deduces that <real payload size> might be
206	 * **theoretically** zero
207	 */
208	char dummy_safeguard;
209};
210
211struct {
212	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
213	__uint(max_entries, 32);
214	__uint(key_size, sizeof(int));
215	__uint(value_size, sizeof(int));
216} samples SEC(".maps");
217
218struct {
219	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
220	__uint(max_entries, 16);
221	__uint(key_size, sizeof(uint32_t));
222	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
223} stacks_0 SEC(".maps");
224
225struct {
226	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
227	__uint(max_entries, 16);
228	__uint(key_size, sizeof(uint32_t));
229	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
230} stacks_1 SEC(".maps");
231
232struct {
233	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
234	__uint(max_entries, 1);
235	__type(key, uint32_t);
236	__type(value, struct strobelight_bpf_sample);
237} sample_heap SEC(".maps");
238
239struct {
240	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
241	__uint(max_entries, STROBE_MAX_CFGS);
242	__type(key, pid_t);
243	__type(value, struct strobemeta_cfg);
244} strobemeta_cfgs SEC(".maps");
245
/*
 * Element type of the dtv array.
 * https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34
 */
typedef union dtv {
	size_t counter;
	struct {
		void *val;
		bool is_static;
	} pointer;
} dtv_t;
255
256/* Partial definition for tcbhead_t */
257/* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
258struct tcbhead {
259	void* tcb;
260	dtv_t* dtv;
261};
262
/*
 * TLS module/offset pair used in the shared-library case.
 * On x86-64 it is laid directly over two GOT entries.
 * On aarch64 the second GOT entry points to it.
 */
struct tls_index {
	uint64_t module;
	uint64_t offset;
};
272
273#ifdef SUBPROGS
274__noinline
275#else
276__always_inline
277#endif
278static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
279{
280	/*
281	 * tls_mode value is:
282	 * - -1 (TLS_NOT_SET), if no metavar is present;
283	 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
284	 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
285	 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
286	 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
287	 * This schema allows to use something like:
288	 * (tls_mode + 1) * (tls_base + offset)
289	 * to get NULL for "no metavar" location, or correct pointer for local
290	 * executable mode without doing extra ifs.
291	 */
292	if (loc->tls_mode <= TLS_LOCAL_EXEC) {
293		/* static executable is simple, we just have offset from
294		 * tls_base */
295		void *addr = tls_base + loc->offset;
296		/* multiply by (tls_mode + 1) to get NULL, if we have no
297		 * metavar in this slot */
298		return (void *)((loc->tls_mode + 1) * (int64_t)addr);
299	}
300	/*
301	 * Other modes are more complicated, we need to jump through few hoops.
302	 *
303	 * For immediate executable mode (currently supported only for aarch64):
304	 *  - loc->offset is pointing to a GOT entry containing fixed offset
305	 *  relative to tls_base;
306	 *
307	 * For general dynamic mode:
308	 *  - loc->offset is pointing to a beginning of double GOT entries;
309	 *  - (for aarch64 only) second entry points to tls_index_t struct;
310	 *  - (for x86-64 only) two GOT entries are already tls_index_t;
311	 *  - tls_index_t->module is used to find start of TLS section in
312	 *  which variable resides;
313	 *  - tls_index_t->offset provides offset within that TLS section,
314	 *  pointing to value of variable.
315	 */
316	struct tls_index tls_index;
317	dtv_t *dtv;
318	void *tls_ptr;
319
320	bpf_probe_read_user(&tls_index, sizeof(struct tls_index),
321			    (void *)loc->offset);
322	/* valid module index is always positive */
323	if (tls_index.module > 0) {
324		/* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
325		bpf_probe_read_user(&dtv, sizeof(dtv),
326				    &((struct tcbhead *)tls_base)->dtv);
327		dtv += tls_index.module;
328	} else {
329		dtv = NULL;
330	}
331	bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
332	/* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
333	return tls_ptr && tls_ptr != (void *)-1
334		? tls_ptr + tls_index.offset
335		: NULL;
336}
337
338#ifdef SUBPROGS
339__noinline
340#else
341__always_inline
342#endif
343static void read_int_var(struct strobemeta_cfg *cfg,
344			 size_t idx, void *tls_base,
345			 struct strobe_value_generic *value,
346			 struct strobemeta_payload *data)
347{
348	void *location = calc_location(&cfg->int_locs[idx], tls_base);
349	if (!location)
350		return;
351
352	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
353	data->int_vals[idx] = value->val;
354	if (value->header.len)
355		data->int_vals_set_mask |= (1 << idx);
356}
357
358static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
359					     size_t idx, void *tls_base,
360					     struct strobe_value_generic *value,
361					     struct strobemeta_payload *data,
362					     size_t off)
363{
364	void *location;
365	uint64_t len;
366
367	data->str_lens[idx] = 0;
368	location = calc_location(&cfg->str_locs[idx], tls_base);
369	if (!location)
370		return 0;
371
372	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
373	len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr);
374	/*
375	 * if bpf_probe_read_user_str returns error (<0), due to casting to
376	 * unsigned int, it will become big number, so next check is
377	 * sufficient to check for errors AND prove to BPF verifier, that
378	 * bpf_probe_read_user_str won't return anything bigger than
379	 * STROBE_MAX_STR_LEN
380	 */
381	if (len > STROBE_MAX_STR_LEN)
382		return 0;
383
384	data->str_lens[idx] = len;
385	return off + len;
386}
387
388static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg,
389					     size_t idx, void *tls_base,
390					     struct strobe_value_generic *value,
391					     struct strobemeta_payload *data,
392					     size_t off)
393{
394	struct strobe_map_descr* descr = &data->map_descrs[idx];
395	struct strobe_map_raw map;
396	void *location;
397	uint64_t len;
398
399	descr->tag_len = 0; /* presume no tag is set */
400	descr->cnt = -1; /* presume no value is set */
401
402	location = calc_location(&cfg->map_locs[idx], tls_base);
403	if (!location)
404		return off;
405
406	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
407	if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
408		return off;
409
410	descr->id = map.id;
411	descr->cnt = map.cnt;
412	if (cfg->req_meta_idx == idx) {
413		data->req_id = map.id;
414		data->req_meta_valid = 1;
415	}
416
417	len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag);
418	if (len <= STROBE_MAX_STR_LEN) {
419		descr->tag_len = len;
420		off += len;
421	}
422
423#ifdef NO_UNROLL
424	__pragma_loop_no_unroll
425#else
426	__pragma_loop_unroll
427#endif
428	for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
429		if (i >= map.cnt)
430			break;
431
432		descr->key_lens[i] = 0;
433		len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
434					      map.entries[i].key);
435		if (len <= STROBE_MAX_STR_LEN) {
436			descr->key_lens[i] = len;
437			off += len;
438		}
439		descr->val_lens[i] = 0;
440		len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
441					      map.entries[i].val);
442		if (len <= STROBE_MAX_STR_LEN) {
443			descr->val_lens[i] = len;
444			off += len;
445		}
446	}
447
448	return off;
449}
450
451#ifdef USE_BPF_LOOP
/* Selects which kind of metavariable read_var_callback() processes. */
enum read_type {
	READ_INT_VAR = 0,
	READ_MAP_VAR = 1,
	READ_STR_VAR = 2,
};
457
458struct read_var_ctx {
459	struct strobemeta_payload *data;
460	void *tls_base;
461	struct strobemeta_cfg *cfg;
462	size_t payload_off;
463	/* value gets mutated */
464	struct strobe_value_generic *value;
465	enum read_type type;
466};
467
468static int read_var_callback(__u64 index, struct read_var_ctx *ctx)
469{
470	/* lose precision info for ctx->payload_off, verifier won't track
471	 * double xor, barrier_var() is needed to force clang keep both xors.
472	 */
473	ctx->payload_off ^= index;
474	barrier_var(ctx->payload_off);
475	ctx->payload_off ^= index;
476	switch (ctx->type) {
477	case READ_INT_VAR:
478		if (index >= STROBE_MAX_INTS)
479			return 1;
480		read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data);
481		break;
482	case READ_MAP_VAR:
483		if (index >= STROBE_MAX_MAPS)
484			return 1;
485		if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP)
486			return 1;
487		ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base,
488						ctx->value, ctx->data, ctx->payload_off);
489		break;
490	case READ_STR_VAR:
491		if (index >= STROBE_MAX_STRS)
492			return 1;
493		if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN)
494			return 1;
495		ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base,
496						ctx->value, ctx->data, ctx->payload_off);
497		break;
498	}
499	return 0;
500}
501#endif /* USE_BPF_LOOP */
502
503/*
504 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
505 * pointer to *right after* payload ends
506 */
507#ifdef SUBPROGS
508__noinline
509#else
510__always_inline
511#endif
512static void *read_strobe_meta(struct task_struct *task,
513			      struct strobemeta_payload *data)
514{
515	pid_t pid = bpf_get_current_pid_tgid() >> 32;
516	struct strobe_value_generic value = {0};
517	struct strobemeta_cfg *cfg;
518	size_t payload_off;
519	void *tls_base;
520
521	cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
522	if (!cfg)
523		return NULL;
524
525	data->int_vals_set_mask = 0;
526	data->req_meta_valid = 0;
527	payload_off = 0;
528	/*
529	 * we don't have struct task_struct definition, it should be:
530	 * tls_base = (void *)task->thread.fsbase;
531	 */
532	tls_base = (void *)task;
533
534#ifdef USE_BPF_LOOP
535	struct read_var_ctx ctx = {
536		.cfg = cfg,
537		.tls_base = tls_base,
538		.value = &value,
539		.data = data,
540		.payload_off = 0,
541	};
542	int err;
543
544	ctx.type = READ_INT_VAR;
545	err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0);
546	if (err != STROBE_MAX_INTS)
547		return NULL;
548
549	ctx.type = READ_STR_VAR;
550	err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0);
551	if (err != STROBE_MAX_STRS)
552		return NULL;
553
554	ctx.type = READ_MAP_VAR;
555	err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
556	if (err != STROBE_MAX_MAPS)
557		return NULL;
558
559	payload_off = ctx.payload_off;
560	/* this should not really happen, here only to satisfy verifier */
561	if (payload_off > sizeof(data->payload))
562		payload_off = sizeof(data->payload);
563#else
564#ifdef NO_UNROLL
565	__pragma_loop_no_unroll
566#else
567	__pragma_loop_unroll
568#endif /* NO_UNROLL */
569	for (int i = 0; i < STROBE_MAX_INTS; ++i) {
570		read_int_var(cfg, i, tls_base, &value, data);
571	}
572#ifdef NO_UNROLL
573	__pragma_loop_no_unroll
574#else
575	__pragma_loop_unroll
576#endif /* NO_UNROLL */
577	for (int i = 0; i < STROBE_MAX_STRS; ++i) {
578		payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off);
579	}
580#ifdef NO_UNROLL
581	__pragma_loop_no_unroll
582#else
583	__pragma_loop_unroll
584#endif /* NO_UNROLL */
585	for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
586		payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off);
587	}
588#endif /* USE_BPF_LOOP */
589
590	/*
591	 * return pointer right after end of payload, so it's possible to
592	 * calculate exact amount of useful data that needs to be sent
593	 */
594	return &data->payload[payload_off];
595}
596
597SEC("raw_tracepoint/kfree_skb")
598int on_event(struct pt_regs *ctx) {
599	pid_t pid =  bpf_get_current_pid_tgid() >> 32;
600	struct strobelight_bpf_sample* sample;
601	struct task_struct *task;
602	uint32_t zero = 0;
603	uint64_t ktime_ns;
604	void *sample_end;
605
606	sample = bpf_map_lookup_elem(&sample_heap, &zero);
607	if (!sample)
608		return 0; /* this will never happen */
609
610	sample->pid = pid;
611	bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
612	ktime_ns = bpf_ktime_get_ns();
613	sample->ktime = ktime_ns;
614
615	task = (struct task_struct *)bpf_get_current_task();
616	sample_end = read_strobe_meta(task, &sample->metadata);
617	sample->has_meta = sample_end != NULL;
618	sample_end = sample_end ? : &sample->metadata;
619
620	if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
621		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
622		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
623	} else {
624		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
625		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
626	}
627
628	uint64_t sample_size = sample_end - (void *)sample;
629	/* should always be true */
630	if (sample_size < sizeof(struct strobelight_bpf_sample))
631		bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
632	return 0;
633}
634
635char _license[] SEC("license") = "GPL";
v6.8
  1// SPDX-License-Identifier: GPL-2.0
  2// Copyright (c) 2019 Facebook
  3
  4#include <stdint.h>
  5#include <stddef.h>
  6#include <stdbool.h>
  7#include <linux/bpf.h>
  8#include <linux/ptrace.h>
  9#include <linux/sched.h>
 10#include <linux/types.h>
 11#include <bpf/bpf_helpers.h>
 12
 
 
 13typedef uint32_t pid_t;
 14struct task_struct {};
 15
 16#define TASK_COMM_LEN 16
 17#define PERF_MAX_STACK_DEPTH 127
 18
 19#define STROBE_TYPE_INVALID 0
 20#define STROBE_TYPE_INT 1
 21#define STROBE_TYPE_STR 2
 22#define STROBE_TYPE_MAP 3
 23
 24#define STACK_TABLE_EPOCH_SHIFT 20
 25#define STROBE_MAX_STR_LEN 1
 26#define STROBE_MAX_CFGS 32
 27#define READ_MAP_VAR_PAYLOAD_CAP					\
 28	((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
 29#define STROBE_MAX_PAYLOAD						\
 30	(STROBE_MAX_STRS * STROBE_MAX_STR_LEN +				\
 31	 STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP)
 32
 33struct strobe_value_header {
 34	/*
 35	 * meaning depends on type:
 36	 * 1. int: 0, if value not set, 1 otherwise
 37	 * 2. str: 1 always, whether value is set or not is determined by ptr
 38	 * 3. map: 1 always, pointer points to additional struct with number
 39	 *    of entries (up to STROBE_MAX_MAP_ENTRIES)
 40	 */
 41	uint16_t len;
 42	/*
 43	 * _reserved might be used for some future fields/flags, but we always
 44	 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16
 45	 * bytes in one go and get both header and value
 46	 */
 47	uint8_t _reserved[6];
 48};
 49
 50/*
 51 * strobe_value_generic is used from BPF probe only, but needs to be a union
 52 * of strobe_value_int/strobe_value_str/strobe_value_map
 53 */
 54struct strobe_value_generic {
 55	struct strobe_value_header header;
 56	union {
 57		int64_t val;
 58		void *ptr;
 59	};
 60};
 61
 62struct strobe_value_int {
 63	struct strobe_value_header header;
 64	int64_t value;
 65};
 66
 67struct strobe_value_str {
 68	struct strobe_value_header header;
 69	const char* value;
 70};
 71
 72struct strobe_value_map {
 73	struct strobe_value_header header;
 74	const struct strobe_map_raw* value;
 75};
 76
 77struct strobe_map_entry {
 78	const char* key;
 79	const char* val;
 80};
 81
 82/*
 83 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
 84 * corresponding int64 ID, which application can use (or ignore) in whatever
 85 * way appropriate. Map is "write-only", there is no way to get data out of
 86 * map. Map is intended to be used to provide metadata for profilers and is
 87 * not to be used for internal in-app communication. All methods are
 88 * thread-safe.
 89 */
 90struct strobe_map_raw {
 91	/*
 92	 * general purpose unique ID that's up to application to decide
 93	 * whether and how to use; for request metadata use case id is unique
 94	 * request ID that's used to match metadata with stack traces on
 95	 * Strobelight backend side
 96	 */
 97	int64_t id;
 98	/* number of used entries in map */
 99	int64_t cnt;
100	/*
101	 * having volatile doesn't change anything on BPF side, but clang
102	 * emits warnings for passing `volatile const char *` into
103	 * bpf_probe_read_user_str that expects just `const char *`
104	 */
105	const char* tag;
106	/*
107	 * key/value entries, each consisting of 2 pointers to key and value
108	 * C strings
109	 */
110	struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
111};
112
113/* Following values define supported values of TLS mode */
114#define TLS_NOT_SET -1
115#define TLS_LOCAL_EXEC 0
116#define TLS_IMM_EXEC 1
117#define TLS_GENERAL_DYN 2
118
119/*
120 * structure that universally represents TLS location (both for static
121 * executables and shared libraries)
122 */
123struct strobe_value_loc {
124	/*
125	 * tls_mode defines what TLS mode was used for particular metavariable:
126	 * - -1 (TLS_NOT_SET) - no metavariable;
127	 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
128	 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
129	 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
130	 * Local Dynamic mode is not yet supported, because never seen in
131	 * practice.  Mode defines how offset field is interpreted. See
132	 * calc_location() in below for details.
133	 */
134	int64_t tls_mode;
135	/*
136	 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64,
137	 * tpidr_el0 for aarch64).
138	 * TLS_IMM_EXEC: absolute address of GOT entry containing offset
139	 * from thread pointer;
140	 * TLS_GENERAL_DYN: absolute address of double GOT entry
141	 * containing tls_index_t struct;
142	 */
143	int64_t offset;
144};
145
146struct strobemeta_cfg {
147	int64_t req_meta_idx;
148	struct strobe_value_loc int_locs[STROBE_MAX_INTS];
149	struct strobe_value_loc str_locs[STROBE_MAX_STRS];
150	struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
151};
152
153struct strobe_map_descr {
154	uint64_t id;
155	int16_t tag_len;
156	/*
157	 * cnt <0 - map value isn't set;
158	 * 0 - map has id set, but no key/value entries
159	 */
160	int16_t cnt;
161	/*
162	 * both key_lens[i] and val_lens[i] should be >0 for present key/value
163	 * entry
164	 */
165	uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
166	uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
167};
168
169struct strobemeta_payload {
170	/* req_id has valid request ID, if req_meta_valid == 1 */
171	int64_t req_id;
172	uint8_t req_meta_valid;
173	/*
174	 * mask has Nth bit set to 1, if Nth metavar was present and
175	 * successfully read
176	 */
177	uint64_t int_vals_set_mask;
178	int64_t int_vals[STROBE_MAX_INTS];
179	/* len is >0 for present values */
180	uint16_t str_lens[STROBE_MAX_STRS];
181	/* if map_descrs[i].cnt == -1, metavar is not present/set */
182	struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
183	/*
184	 * payload has compactly packed values of str and map variables in the
185	 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
186	 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
187	 * value length
188	 */
189	char payload[STROBE_MAX_PAYLOAD];
190};
191
192struct strobelight_bpf_sample {
193	uint64_t ktime;
194	char comm[TASK_COMM_LEN];
195	pid_t pid;
196	int user_stack_id;
197	int kernel_stack_id;
198	int has_meta;
199	struct strobemeta_payload metadata;
200	/*
201	 * makes it possible to pass (<real payload size> + 1) as data size to
202	 * perf_submit() to avoid perf_submit's paranoia about passing zero as
203	 * size, as it deduces that <real payload size> might be
204	 * **theoretically** zero
205	 */
206	char dummy_safeguard;
207};
208
209struct {
210	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
211	__uint(max_entries, 32);
212	__uint(key_size, sizeof(int));
213	__uint(value_size, sizeof(int));
214} samples SEC(".maps");
215
216struct {
217	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
218	__uint(max_entries, 16);
219	__uint(key_size, sizeof(uint32_t));
220	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
221} stacks_0 SEC(".maps");
222
223struct {
224	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
225	__uint(max_entries, 16);
226	__uint(key_size, sizeof(uint32_t));
227	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
228} stacks_1 SEC(".maps");
229
230struct {
231	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
232	__uint(max_entries, 1);
233	__type(key, uint32_t);
234	__type(value, struct strobelight_bpf_sample);
235} sample_heap SEC(".maps");
236
237struct {
238	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
239	__uint(max_entries, STROBE_MAX_CFGS);
240	__type(key, pid_t);
241	__type(value, struct strobemeta_cfg);
242} strobemeta_cfgs SEC(".maps");
243
244/* Type for the dtv.  */
245/* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */
246typedef union dtv {
247	size_t counter;
248	struct {
249		void* val;
250		bool is_static;
251	} pointer;
252} dtv_t;
253
254/* Partial definition for tcbhead_t */
255/* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
256struct tcbhead {
257	void* tcb;
258	dtv_t* dtv;
259};
260
261/*
262 * TLS module/offset information for shared library case.
263 * For x86-64, this is mapped onto two entries in GOT.
264 * For aarch64, this is pointed to by second GOT entry.
265 */
266struct tls_index {
267	uint64_t module;
268	uint64_t offset;
269};
270
271#ifdef SUBPROGS
272__noinline
273#else
274__always_inline
275#endif
276static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
277{
278	/*
279	 * tls_mode value is:
280	 * - -1 (TLS_NOT_SET), if no metavar is present;
281	 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
282	 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
283	 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
284	 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
285	 * This schema allows to use something like:
286	 * (tls_mode + 1) * (tls_base + offset)
287	 * to get NULL for "no metavar" location, or correct pointer for local
288	 * executable mode without doing extra ifs.
289	 */
290	if (loc->tls_mode <= TLS_LOCAL_EXEC) {
291		/* static executable is simple, we just have offset from
292		 * tls_base */
293		void *addr = tls_base + loc->offset;
294		/* multiply by (tls_mode + 1) to get NULL, if we have no
295		 * metavar in this slot */
296		return (void *)((loc->tls_mode + 1) * (int64_t)addr);
297	}
298	/*
299	 * Other modes are more complicated, we need to jump through few hoops.
300	 *
301	 * For immediate executable mode (currently supported only for aarch64):
302	 *  - loc->offset is pointing to a GOT entry containing fixed offset
303	 *  relative to tls_base;
304	 *
305	 * For general dynamic mode:
306	 *  - loc->offset is pointing to a beginning of double GOT entries;
307	 *  - (for aarch64 only) second entry points to tls_index_t struct;
308	 *  - (for x86-64 only) two GOT entries are already tls_index_t;
309	 *  - tls_index_t->module is used to find start of TLS section in
310	 *  which variable resides;
311	 *  - tls_index_t->offset provides offset within that TLS section,
312	 *  pointing to value of variable.
313	 */
314	struct tls_index tls_index;
315	dtv_t *dtv;
316	void *tls_ptr;
317
318	bpf_probe_read_user(&tls_index, sizeof(struct tls_index),
319			    (void *)loc->offset);
320	/* valid module index is always positive */
321	if (tls_index.module > 0) {
322		/* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
323		bpf_probe_read_user(&dtv, sizeof(dtv),
324				    &((struct tcbhead *)tls_base)->dtv);
325		dtv += tls_index.module;
326	} else {
327		dtv = NULL;
328	}
329	bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
330	/* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
331	return tls_ptr && tls_ptr != (void *)-1
332		? tls_ptr + tls_index.offset
333		: NULL;
334}
335
336#ifdef SUBPROGS
337__noinline
338#else
339__always_inline
340#endif
341static void read_int_var(struct strobemeta_cfg *cfg,
342			 size_t idx, void *tls_base,
343			 struct strobe_value_generic *value,
344			 struct strobemeta_payload *data)
345{
346	void *location = calc_location(&cfg->int_locs[idx], tls_base);
347	if (!location)
348		return;
349
350	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
351	data->int_vals[idx] = value->val;
352	if (value->header.len)
353		data->int_vals_set_mask |= (1 << idx);
354}
355
356static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
357					     size_t idx, void *tls_base,
358					     struct strobe_value_generic *value,
359					     struct strobemeta_payload *data,
360					     size_t off)
361{
362	void *location;
363	uint64_t len;
364
365	data->str_lens[idx] = 0;
366	location = calc_location(&cfg->str_locs[idx], tls_base);
367	if (!location)
368		return 0;
369
370	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
371	len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr);
372	/*
373	 * if bpf_probe_read_user_str returns error (<0), due to casting to
374	 * unsinged int, it will become big number, so next check is
375	 * sufficient to check for errors AND prove to BPF verifier, that
376	 * bpf_probe_read_user_str won't return anything bigger than
377	 * STROBE_MAX_STR_LEN
378	 */
379	if (len > STROBE_MAX_STR_LEN)
380		return 0;
381
382	data->str_lens[idx] = len;
383	return off + len;
384}
385
386static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg,
387					     size_t idx, void *tls_base,
388					     struct strobe_value_generic *value,
389					     struct strobemeta_payload *data,
390					     size_t off)
391{
392	struct strobe_map_descr* descr = &data->map_descrs[idx];
393	struct strobe_map_raw map;
394	void *location;
395	uint64_t len;
396
397	descr->tag_len = 0; /* presume no tag is set */
398	descr->cnt = -1; /* presume no value is set */
399
400	location = calc_location(&cfg->map_locs[idx], tls_base);
401	if (!location)
402		return off;
403
404	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
405	if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
406		return off;
407
408	descr->id = map.id;
409	descr->cnt = map.cnt;
410	if (cfg->req_meta_idx == idx) {
411		data->req_id = map.id;
412		data->req_meta_valid = 1;
413	}
414
415	len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag);
416	if (len <= STROBE_MAX_STR_LEN) {
417		descr->tag_len = len;
418		off += len;
419	}
420
421#ifdef NO_UNROLL
422#pragma clang loop unroll(disable)
423#else
424#pragma unroll
425#endif
426	for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
427		if (i >= map.cnt)
428			break;
429
430		descr->key_lens[i] = 0;
431		len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
432					      map.entries[i].key);
433		if (len <= STROBE_MAX_STR_LEN) {
434			descr->key_lens[i] = len;
435			off += len;
436		}
437		descr->val_lens[i] = 0;
438		len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
439					      map.entries[i].val);
440		if (len <= STROBE_MAX_STR_LEN) {
441			descr->val_lens[i] = len;
442			off += len;
443		}
444	}
445
446	return off;
447}
448
449#ifdef USE_BPF_LOOP
/* Kind of metavar that read_var_callback() reads on a given bpf_loop() pass. */
enum read_type {
	READ_INT_VAR = 0,
	READ_MAP_VAR = 1,
	READ_STR_VAR = 2,
};
455
456struct read_var_ctx {
457	struct strobemeta_payload *data;
458	void *tls_base;
459	struct strobemeta_cfg *cfg;
460	size_t payload_off;
461	/* value gets mutated */
462	struct strobe_value_generic *value;
463	enum read_type type;
464};
465
466static int read_var_callback(__u64 index, struct read_var_ctx *ctx)
467{
468	/* lose precision info for ctx->payload_off, verifier won't track
469	 * double xor, barrier_var() is needed to force clang keep both xors.
470	 */
471	ctx->payload_off ^= index;
472	barrier_var(ctx->payload_off);
473	ctx->payload_off ^= index;
474	switch (ctx->type) {
475	case READ_INT_VAR:
476		if (index >= STROBE_MAX_INTS)
477			return 1;
478		read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data);
479		break;
480	case READ_MAP_VAR:
481		if (index >= STROBE_MAX_MAPS)
482			return 1;
483		if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP)
484			return 1;
485		ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base,
486						ctx->value, ctx->data, ctx->payload_off);
487		break;
488	case READ_STR_VAR:
489		if (index >= STROBE_MAX_STRS)
490			return 1;
491		if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN)
492			return 1;
493		ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base,
494						ctx->value, ctx->data, ctx->payload_off);
495		break;
496	}
497	return 0;
498}
499#endif /* USE_BPF_LOOP */
500
501/*
502 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
503 * pointer to *right after* payload ends
504 */
505#ifdef SUBPROGS
506__noinline
507#else
508__always_inline
509#endif
510static void *read_strobe_meta(struct task_struct *task,
511			      struct strobemeta_payload *data)
512{
513	pid_t pid = bpf_get_current_pid_tgid() >> 32;
514	struct strobe_value_generic value = {0};
515	struct strobemeta_cfg *cfg;
516	size_t payload_off;
517	void *tls_base;
518
519	cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
520	if (!cfg)
521		return NULL;
522
523	data->int_vals_set_mask = 0;
524	data->req_meta_valid = 0;
525	payload_off = 0;
526	/*
527	 * we don't have struct task_struct definition, it should be:
528	 * tls_base = (void *)task->thread.fsbase;
529	 */
530	tls_base = (void *)task;
531
532#ifdef USE_BPF_LOOP
533	struct read_var_ctx ctx = {
534		.cfg = cfg,
535		.tls_base = tls_base,
536		.value = &value,
537		.data = data,
538		.payload_off = 0,
539	};
540	int err;
541
542	ctx.type = READ_INT_VAR;
543	err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0);
544	if (err != STROBE_MAX_INTS)
545		return NULL;
546
547	ctx.type = READ_STR_VAR;
548	err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0);
549	if (err != STROBE_MAX_STRS)
550		return NULL;
551
552	ctx.type = READ_MAP_VAR;
553	err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
554	if (err != STROBE_MAX_MAPS)
555		return NULL;
556
557	payload_off = ctx.payload_off;
558	/* this should not really happen, here only to satisfy verifer */
559	if (payload_off > sizeof(data->payload))
560		payload_off = sizeof(data->payload);
561#else
562#ifdef NO_UNROLL
563#pragma clang loop unroll(disable)
564#else
565#pragma unroll
566#endif /* NO_UNROLL */
567	for (int i = 0; i < STROBE_MAX_INTS; ++i) {
568		read_int_var(cfg, i, tls_base, &value, data);
569	}
570#ifdef NO_UNROLL
571#pragma clang loop unroll(disable)
572#else
573#pragma unroll
574#endif /* NO_UNROLL */
575	for (int i = 0; i < STROBE_MAX_STRS; ++i) {
576		payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off);
577	}
578#ifdef NO_UNROLL
579#pragma clang loop unroll(disable)
580#else
581#pragma unroll
582#endif /* NO_UNROLL */
583	for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
584		payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off);
585	}
586#endif /* USE_BPF_LOOP */
587
588	/*
589	 * return pointer right after end of payload, so it's possible to
590	 * calculate exact amount of useful data that needs to be sent
591	 */
592	return &data->payload[payload_off];
593}
594
595SEC("raw_tracepoint/kfree_skb")
596int on_event(struct pt_regs *ctx) {
597	pid_t pid =  bpf_get_current_pid_tgid() >> 32;
598	struct strobelight_bpf_sample* sample;
599	struct task_struct *task;
600	uint32_t zero = 0;
601	uint64_t ktime_ns;
602	void *sample_end;
603
604	sample = bpf_map_lookup_elem(&sample_heap, &zero);
605	if (!sample)
606		return 0; /* this will never happen */
607
608	sample->pid = pid;
609	bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
610	ktime_ns = bpf_ktime_get_ns();
611	sample->ktime = ktime_ns;
612
613	task = (struct task_struct *)bpf_get_current_task();
614	sample_end = read_strobe_meta(task, &sample->metadata);
615	sample->has_meta = sample_end != NULL;
616	sample_end = sample_end ? : &sample->metadata;
617
618	if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
619		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
620		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
621	} else {
622		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
623		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
624	}
625
626	uint64_t sample_size = sample_end - (void *)sample;
627	/* should always be true */
628	if (sample_size < sizeof(struct strobelight_bpf_sample))
629		bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
630	return 0;
631}
632
633char _license[] SEC("license") = "GPL";