/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple scheduler.
 *
 * By default, it operates as a simple global weighted vtime scheduler and can
 * be switched to FIFO scheduling. It also demonstrates the following niceties.
 *
 * - Statistics tracking how many tasks are queued to local and global dsq's.
 * - Termination notification for userspace.
 *
 * While very simple, this scheduler should work reasonably well on CPUs with a
 * uniform L3 cache topology. While preemption is not implemented, the fact that
 * the scheduling queue is shared across all CPUs means that whatever is at the
 * front of the queue is likely to be executed fairly quickly given a
 * sufficient number of CPUs. The FIFO scheduling mode may be beneficial to some
 * workloads but comes with the usual problems of FIFO scheduling, where
 * saturating threads can easily drown out interactive ones.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

const volatile bool fifo_sched;

static u64 vtime_now;
UEI_DEFINE(uei);

/*
 * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues
 * (meaning, cannot be dispatched to with scx_bpf_dsq_insert_vtime()). We
 * therefore create a separate DSQ with ID 0 that we dispatch to and consume
 * from. If scx_simple only supported global FIFO scheduling, then we could just
 * use SCX_DSQ_GLOBAL.
 */
#define SHARED_DSQ 0

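/*
 * Per-CPU counters tracking local and global queueing decisions, intended to
 * be read and summed by the userspace side of scx_simple.
 */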
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u64));
	__uint(max_entries, 2);			/* [local, global] */
} stats SEC(".maps");

static void stat_inc(u32 idx)
{
	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
	if (cnt_p)
		(*cnt_p)++;
}

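/*
 * Returns true if vtime @a is before @b. The signed subtraction keeps the
 * comparison correct even after the u64 vtime counters wrap around.
 */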
static inline bool vtime_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;
}

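/*
 * Pick a CPU for a waking task. scx_bpf_select_cpu_dfl() applies the default
 * idle-CPU selection logic; if it found an idle CPU, dispatch the task
 * directly to that CPU's local DSQ, which lets sched_ext skip ops.enqueue()
 * for this wakeup.
 */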
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		stat_inc(0);	/* count local queueing */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;
}

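/*
 * Queue a runnable task that was not dispatched directly from
 * ops.select_cpu(). In FIFO mode the task goes to the tail of the shared DSQ;
 * otherwise it is inserted in vtime order.
 */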
void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
{
	stat_inc(1);	/* count global queueing */

	if (fifo_sched) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
	} else {
		u64 vtime = p->scx.dsq_vtime;

		/*
		 * Limit the amount of budget that an idling task can accumulate
		 * to one slice.
		 */
		if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
			vtime = vtime_now - SCX_SLICE_DFL;

		scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
					 enq_flags);
	}
}

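/*
 * Called when a CPU's local DSQ is empty and it needs more work. Move the
 * first task from the shared DSQ to this CPU's local DSQ.
 */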
void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

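/* Called when @p starts running on a CPU. Used to advance the global vtime. */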
void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
{
	if (fifo_sched)
		return;

	/*
	 * Global vtime always progresses forward as tasks start executing. The
	 * test and update can be performed concurrently from multiple CPUs and
	 * are thus racy. Any error should be contained and temporary. Let's
	 * just live with it.
	 */
	if (vtime_before(vtime_now, p->scx.dsq_vtime))
		vtime_now = p->scx.dsq_vtime;
}

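/*
 * Called when @p stops running. Charge the consumed portion of the slice to
 * the task's vtime, scaled inversely by its weight so that heavier tasks
 * accrue vtime more slowly.
 */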
void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
{
	if (fifo_sched)
		return;

	/*
	 * Scale the execution time by the inverse of the weight and charge.
	 *
	 * Note that the default yield implementation yields by setting
	 * @p->scx.slice to zero and the following would treat the yielding task
	 * as if it had consumed all of its slice. If this penalizes yielding
	 * tasks too much, determine the execution time by taking explicit
	 * timestamps instead of depending on @p->scx.slice.
	 */
	p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
}

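/*
 * Called when @p is enabled for sched_ext scheduling. Initialize its vtime to
 * the current global vtime.
 */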
void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
{
	p->scx.dsq_vtime = vtime_now;
}

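/*
 * Called when the scheduler is loaded. Create the shared DSQ; -1 means the
 * DSQ is not tied to any particular NUMA node.
 */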
s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

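/*
 * Called when the scheduler is being unloaded or has aborted. Record the exit
 * info so the userspace loader can report why the scheduler exited.
 */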
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

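/*
 * Tie the callbacks together into the sched_ext_ops struct_ops map that the
 * kernel attaches to as the "simple" scheduler.
 */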
SCX_OPS_DEFINE(simple_ops,
	       .select_cpu		= (void *)simple_select_cpu,
	       .enqueue			= (void *)simple_enqueue,
	       .dispatch		= (void *)simple_dispatch,
	       .running			= (void *)simple_running,
	       .stopping		= (void *)simple_stopping,
	       .enable			= (void *)simple_enable,
	       .init			= (void *)simple_init,
	       .exit			= (void *)simple_exit,
	       .name			= "simple");