// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <asm-generic/errno-base.h>

#include "lock_data.h"

/* fallback for compilers without __has_builtin; must come before its first use */
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif

/* for collect_lock_syms(). 4096 was rejected by the verifier */
#define MAX_CPUS  1024

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN	(1U << 0)
#define LCB_F_READ	(1U << 1)
#define LCB_F_WRITE	(1U << 2)
#define LCB_F_RT	(1U << 3)
#define LCB_F_PERCPU	(1U << 4)
#define LCB_F_MUTEX	(1U << 5)

/* callstack storage */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* maintain timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct tstamp_data);
	__uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");

/* maintain per-CPU timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct tstamp_data));
	__uint(max_entries, 1);
} tstamp_cpu SEC(".maps");

/* actual lock contention statistics */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");

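/* per-task data: pid -> task comm */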
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct contention_task_data));
	__uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");

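/* lock address -> lock class flag (e.g. LOCK_CLASS_RQLOCK), filled by collect_lock_syms() */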
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, MAX_ENTRIES);
} lock_syms SEC(".maps");

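/*
 * Optional filters set up by userspace; a key present in a map means the
 * corresponding CPU/task/lock type/address/cgroup passes the filter.
 */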
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} type_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} addr_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

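/*
 * CO-RE variants of kernel structs whose fields were renamed or retyped
 * across kernel versions; bpf_core_type_matches()/bpf_core_field_exists()
 * pick the one that matches the running kernel.
 */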
struct rw_semaphore___old {
	struct task_struct *owner;
} __attribute__((preserve_access_index));

struct rw_semaphore___new {
	atomic_long_t owner;
} __attribute__((preserve_access_index));

struct mm_struct___old {
	struct rw_semaphore mmap_sem;
} __attribute__((preserve_access_index));

struct mm_struct___new {
	struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

/* control flags */
int enabled;
int has_cpu;
int has_task;
int has_type;
int has_addr;
int has_cgroup;
int needs_callstack;
int stack_skip;
int lock_owner;

int use_cgroup_v2;
int perf_subsys_id = -1;

/* determine the key of lock stat */
int aggr_mode;

__u64 end_ts;

/* error stat */
int task_fail;
int stack_fail;
int time_fail;
int data_fail;

int task_map_full;
int data_map_full;

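/* Return the current task's cgroup id (v2 unified hierarchy or v1 perf_event subsystem). */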
static inline __u64 get_current_cgroup_id(void)
{
	struct task_struct *task;
	struct cgroup *cgrp;

	if (use_cgroup_v2)
		return bpf_get_current_cgroup_id();

	task = bpf_get_current_task_btf();

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

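/*
 * Check the optional filters.  For the contention tracepoints,
 * ctx[0] is the lock address and ctx[1] the contention flags.
 */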
static inline int can_record(u64 *ctx)
{
	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = bpf_get_current_pid_tgid();

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_type) {
		__u8 *ok;
		__u32 flags = (__u32)ctx[1];

		ok = bpf_map_lookup_elem(&type_filter, &flags);
		if (!ok)
			return 0;
	}

	if (has_addr) {
		__u8 *ok;
		__u64 addr = ctx[0];

		ok = bpf_map_lookup_elem(&addr_filter, &addr);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp = get_current_cgroup_id();

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
		if (!ok)
			return 0;
	}

	return 1;
}

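/* Save the task's comm in the task_data map, keyed by pid. */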
static inline int update_task_data(struct task_struct *task)
{
	struct contention_task_data *p;
	int pid, err;

	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
	if (err)
		return -1;

	p = bpf_map_lookup_elem(&task_data, &pid);
	if (p == NULL && !task_map_full) {
		struct contention_task_data data = {};

		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
			task_map_full = 1;
	}

	return 0;
}

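/*
 * Read the owner task from the lock.  The low bits of the owner field
 * carry flag bits for both mutex and rwsem, hence the ~7UL mask below.
 */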
static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
{
	struct task_struct *task;
	__u64 owner = 0;

	if (flags & LCB_F_MUTEX) {
		struct mutex *mutex = (void *)lock;
		owner = BPF_CORE_READ(mutex, owner.counter);
	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
		/*
		 * Support for the BPF_TYPE_MATCHES argument to the
		 * __builtin_preserve_type_info builtin was added at some point during
		 * development of clang 15 and it's what is needed for
		 * bpf_core_type_matches.
		 */
#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
		if (bpf_core_type_matches(struct rw_semaphore___old)) {
			struct rw_semaphore___old *rwsem = (void *)lock;
			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
			struct rw_semaphore___new *rwsem = (void *)lock;
			owner = BPF_CORE_READ(rwsem, owner.counter);
		}
#else
		/* assume new struct */
		struct rw_semaphore *rwsem = (void *)lock;
		owner = BPF_CORE_READ(rwsem, owner.counter);
#endif
	}

	if (!owner)
		return NULL;

	task = (void *)(owner & ~7UL);
	return task;
}

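/* Tag well-known locks (mmap_lock, siglock) so they can be reported by name. */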
static inline __u32 check_lock_type(__u64 lock, __u32 flags)
{
	struct task_struct *curr;
	struct mm_struct___old *mm_old;
	struct mm_struct___new *mm_new;
	struct sighand_struct *sighand;

	switch (flags) {
	case LCB_F_READ:  /* rwsem */
	case LCB_F_WRITE:
		curr = bpf_get_current_task_btf();
		if (curr->mm == NULL)
			break;
		mm_new = (void *)curr->mm;
		if (bpf_core_field_exists(mm_new->mmap_lock)) {
			if (&mm_new->mmap_lock == (void *)lock)
				return LCD_F_MMAP_LOCK;
			break;
		}
		mm_old = (void *)curr->mm;
		if (bpf_core_field_exists(mm_old->mmap_sem)) {
			if (&mm_old->mmap_sem == (void *)lock)
				return LCD_F_MMAP_LOCK;
		}
		break;
	case LCB_F_SPIN:  /* spinlock */
		curr = bpf_get_current_task_btf();
		sighand = curr->sighand;

		if (sighand && &sighand->siglock == (void *)lock)
			return LCD_F_SIGHAND_LOCK;
		break;
	default:
		break;
	}
	return 0;
}

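/* Find (or create) the timestamp slot used to track this contention. */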
static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
{
	__u32 pid;
	struct tstamp_data *pelem;

	/* Use per-cpu array map for spinlock and rwlock */
	if (flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN ||
	    flags == (LCB_F_SPIN | LCB_F_WRITE)) {
		__u32 idx = 0;

		pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
		/* Do not update the element for nested locks */
		if (pelem && pelem->lock)
			pelem = NULL;
		return pelem;
	}

	pid = bpf_get_current_pid_tgid();
	pelem = bpf_map_lookup_elem(&tstamp, &pid);
	/* Do not update the element for nested locks */
	if (pelem && pelem->lock)
		return NULL;

	if (pelem == NULL) {
		struct tstamp_data zero = {};

		if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}

		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (pelem == NULL) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}
	}
	return pelem;
}

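/* tracepoint args: ctx[0] = lock address, ctx[1] = contention flags */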
SEC("tp_btf/contention_begin")
int contention_begin(u64 *ctx)
{
	struct tstamp_data *pelem;

	if (!enabled || !can_record(ctx))
		return 0;

	pelem = get_tstamp_elem(ctx[1]);
	if (pelem == NULL)
		return 0;

	pelem->timestamp = bpf_ktime_get_ns();
	pelem->lock = (__u64)ctx[0];
	pelem->flags = (__u32)ctx[1];

	if (needs_callstack) {
		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
						  BPF_F_FAST_STACK_CMP | stack_skip);
		if (pelem->stack_id < 0)
			__sync_fetch_and_add(&stack_fail, 1);
	} else if (aggr_mode == LOCK_AGGR_TASK) {
		struct task_struct *task;

		if (lock_owner) {
			task = get_lock_owner(pelem->lock, pelem->flags);

			/* The flags field is not used anymore; pass the owner pid instead. */
			if (task)
				pelem->flags = BPF_CORE_READ(task, pid);
			else
				pelem->flags = -1U;

		} else {
			task = bpf_get_current_task_btf();
		}

		if (task) {
			if (update_task_data(task) < 0 && lock_owner)
				pelem->flags = -1U;
		}
	}

	return 0;
}

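/* tracepoint args: ctx[0] = lock address, ctx[1] = return value (unused here) */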
SEC("tp_btf/contention_end")
int contention_end(u64 *ctx)
{
	__u32 pid = 0, idx = 0;
	struct tstamp_data *pelem;
	struct contention_key key = {};
	struct contention_data *data;
	__u64 duration;
	bool need_delete = false;

	if (!enabled)
		return 0;

	/*
	 * For spinlocks and rwlocks, the timestamp was saved in the per-cpu
	 * map.  However, contention_end does not carry the flags, so it
	 * cannot tell whether to read the per-cpu or the hash map.
	 *
	 * Try the per-cpu map first and check if there's an active contention.
	 * If there is, do not read the hash map because the task cannot block
	 * on a sleeping lock before releasing the spinning lock.
	 */
	pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
	if (pelem && pelem->lock) {
		if (pelem->lock != ctx[0])
			return 0;
	} else {
		pid = bpf_get_current_pid_tgid();
		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (!pelem || pelem->lock != ctx[0])
			return 0;
		need_delete = true;
	}

	duration = bpf_ktime_get_ns() - pelem->timestamp;
	if ((__s64)duration < 0) {
		pelem->lock = 0;
		if (need_delete)
			bpf_map_delete_elem(&tstamp, &pid);
		__sync_fetch_and_add(&time_fail, 1);
		return 0;
	}

	switch (aggr_mode) {
	case LOCK_AGGR_CALLER:
		key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_TASK:
		if (lock_owner)
			key.pid = pelem->flags;
		else {
			if (!need_delete)
				pid = bpf_get_current_pid_tgid();
			key.pid = pid;
		}
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_ADDR:
		key.lock_addr_or_cgroup = pelem->lock;
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_CGROUP:
		key.lock_addr_or_cgroup = get_current_cgroup_id();
		break;
	default:
		/* should not happen */
		return 0;
	}

	data = bpf_map_lookup_elem(&lock_stat, &key);
	if (!data) {
		if (data_map_full) {
			pelem->lock = 0;
			if (need_delete)
				bpf_map_delete_elem(&tstamp, &pid);
			__sync_fetch_and_add(&data_fail, 1);
			return 0;
		}

		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = pelem->flags,
		};
		int err;

		if (aggr_mode == LOCK_AGGR_ADDR)
			first.flags |= check_lock_type(pelem->lock, pelem->flags);

		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
		if (err < 0) {
			if (err == -E2BIG)
				data_map_full = 1;
			__sync_fetch_and_add(&data_fail, 1);
		}
		pelem->lock = 0;
		if (need_delete)
			bpf_map_delete_elem(&tstamp, &pid);
		return 0;
	}

	__sync_fetch_and_add(&data->total_time, duration);
	__sync_fetch_and_add(&data->count, 1);

	/* FIXME: need atomic operations */
	if (data->max_time < duration)
		data->max_time = duration;
	if (data->min_time > duration)
		data->min_time = duration;

	pelem->lock = 0;
	if (need_delete)
		bpf_map_delete_elem(&tstamp, &pid);
	return 0;
}

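/* the per-cpu runqueue variable; used below to compute each CPU's rq lock address */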
extern struct rq runqueues __ksym;

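/* the rq lock field was renamed from 'lock' to '__lock'; keep both variants for CO-RE */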
struct rq___old {
	raw_spinlock_t lock;
} __attribute__((preserve_access_index));

struct rq___new {
	raw_spinlock_t __lock;
} __attribute__((preserve_access_index));

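/* Record each CPU's rq lock address in lock_syms so userspace can print it by name. */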
SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
	__u64 lock_addr, lock_off;
	__u32 lock_flag;

	if (bpf_core_field_exists(struct rq___new, __lock))
		lock_off = offsetof(struct rq___new, __lock);
	else
		lock_off = offsetof(struct rq___old, lock);

	for (int i = 0; i < MAX_CPUS; i++) {
		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);

		if (rq == NULL)
			break;

		lock_addr = (__u64)(void *)rq + lock_off;
		lock_flag = LOCK_CLASS_RQLOCK;
		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
	}
	return 0;
}

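/* Record the timestamp of the end of the profiling session. */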
SEC("raw_tp/bpf_test_finish")
int BPF_PROG(end_timestamp)
{
	end_ts = bpf_ktime_get_ns();
	return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";