/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and is unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS		= 1000000000,
	SHARED_DSQ		= 0,
	HIGHPRI_DSQ		= 1,
	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
};
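
/*
 * Task weights here are sched_ext's compound weights, where the default
 * (nice 0) weight is 100; nice -20 scales up to roughly 8668, which is where
 * HIGHPRI_WEIGHT above comes from.
 */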

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_shared_dsq;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
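
/*
 * The const volatile knobs above live in the BPF object's read-only data and
 * are expected to be set by the userspace loader (e.g. scx_qmap.c) before the
 * skeleton is loaded; values left untouched keep their initializers.
 */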

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set according to the queue index
 * per the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};
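
/*
 * For example, a CPU that has mostly been serving queue0 tasks requests the
 * lowest performance level while one serving queue4 tasks requests full
 * performance (SCX_CPUPERF_ONE). See qmap_tick() below.
 */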

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}
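
/*
 * For example, the default weight of 100 (nice 0) falls into queue2, while a
 * weight at or above HIGHPRI_WEIGHT (nice -20) lands in queue4 and, when
 * highpri_boosting is enabled, is also eligible for the HIGHPRI_DSQ boost.
 */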

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() told us to dispatch directly, enqueue on the
	 * local DSQ of the selected CPU.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dsq_move[_vtime]() is allowed both from ops.dispatch() and from
 * non-rq-lock holding BPF programs. As a demonstration, this function is called
 * from qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			__COMPAT_scx_bpf_dsq_move_set_slice(
				BPF_FOR_EACH_ITER, slice_ns * 2);
			__COMPAT_scx_bpf_dsq_move_set_vtime(
				BPF_FOR_EACH_ITER, highpri_seq++);
			__COMPAT_scx_bpf_dsq_move_vtime(
				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
					      SCX_DSQ_LOCAL_ON | cpu,
					      SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
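		/* dsp_cnt doubles with the queue index: 1 from queue0 up to 16 from queue4 */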
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
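	/*
	 * avg_weight is a simple exponential moving average (3/4 old + 1/4
	 * new). For example, a CPU running only weight-100 (nice 0) tasks
	 * converges to avg_weight 100, i.e. queue2 and the mid-range target.
	 */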
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue w/
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
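	/*
	 * For example, at the same raw distance of 8, a queue0 task scales to
	 * 8 << 4 = 128 while a queue4 task stays at 8, making the queue4 task
	 * the higher priority (older) of the two.
	 */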
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode roughly the same ordering
 * as the regular scheduling path. Use the priority-scaled distances from the
 * heads of the queues to compare the two tasks, which keeps the result
 * consistent with the dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	bpf_printk("CPU %d coming online", cpu);
	/* @cpu is already online at this point */
	print_cpus();
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	bpf_printk("CPU %d going offline", cpu);
	/* @cpu is still online at this point */
	print_cpus();
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
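		/*
		 * For example, a CPU whose capacity is half of SCX_CPUPERF_ONE
		 * running at full cpuperf contributes SCX_CPUPERF_ONE / 2 to
		 * cur_sum, matching its contribution to cap_sum.
		 */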
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_shared_dsq)
		dump_shared_dsq();

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");