Note: File does not exist in v6.8.
/*
 * NUMA support for s390
 *
 * NUMA emulation (aka fake NUMA) distributes the available memory to nodes
 * without using real topology information about the physical memory of the
 * machine.
 *
 * It distributes the available CPUs to nodes while respecting the original
 * machine topology information. This is done by trying to avoid separating
 * CPUs which reside on the same book or even on the same MC.
 *
 * Because the current Linux scheduler code requires a stable cpu to node
 * mapping, cores are pinned to nodes when the first CPU thread is set online.
 *
 * Copyright IBM Corp. 2015
 */

#define KMSG_COMPONENT "numa_emu"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/node.h>
#include <linux/memory.h>
#include <linux/slab.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include "numa_mode.h"
#include "toptree.h"

/* Distances between the different system components */
#define DIST_EMPTY	0
#define DIST_CORE	1
#define DIST_MC		2
#define DIST_BOOK	3
#define DIST_MAX	4

/* Node distance reported to common code */
#define EMU_NODE_DIST	10

/* Node ID for free (not yet pinned) cores */
#define NODE_ID_FREE	-1

/* Different levels of toptree */
enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY};

/* The two toptree IDs */
enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};

/* Number of NUMA nodes */
static int emu_nodes = 1;
/* NUMA stripe size */
static unsigned long emu_size;

/*
 * Node to core pinning information updates are protected by
 * "sched_domains_mutex".
 */
static struct {
	s32 to_node_id[CONFIG_NR_CPUS];	/* Pinned core to node mapping */
	int total;			/* Total number of pinned cores */
	int per_node_target;		/* Cores per node without extra cores */
	int per_node[MAX_NUMNODES];	/* Number of cores pinned to node */
} *emu_cores;

/*
 * Pin a core to a node
 */
static void pin_core_to_node(int core_id, int node_id)
{
	if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) {
		emu_cores->per_node[node_id]++;
		emu_cores->to_node_id[core_id] = node_id;
		emu_cores->total++;
	} else {
		WARN_ON(emu_cores->to_node_id[core_id] != node_id);
	}
}

/*
 * Number of pinned cores of a node
 */
static int cores_pinned(struct toptree *node)
{
	return emu_cores->per_node[node->id];
}

/*
 * ID of the node where the core is pinned (or NODE_ID_FREE)
 */
static int core_pinned_to_node_id(struct toptree *core)
{
	return emu_cores->to_node_id[core->id];
}

/*
 * Number of cores in the tree that are not yet pinned
 */
static int cores_free(struct toptree *tree)
{
	struct toptree *core;
	int count = 0;

	toptree_for_each(core, tree, CORE) {
		if (core_pinned_to_node_id(core) == NODE_ID_FREE)
			count++;
	}
	return count;
}

/*
 * Return node of core
 */
static struct toptree *core_node(struct toptree *core)
{
	return core->parent->parent->parent;
}

/*
 * Return book of core
 */
static struct toptree *core_book(struct toptree *core)
{
	return core->parent->parent;
}

/*
 * Return mc of core
 */
static struct toptree *core_mc(struct toptree *core)
{
	return core->parent;
}

/*
 * Distance between two cores
 */
static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
{
	if (core_book(core1)->id != core_book(core2)->id)
		return DIST_BOOK;
	if (core_mc(core1)->id != core_mc(core2)->id)
		return DIST_MC;
	/* Same core or sibling on same MC */
	return DIST_CORE;
}

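/*
 * Example for dist_core_to_core(): two cores on different books are at
 * DIST_BOOK (3), two cores on the same book but different MCs are at
 * DIST_MC (2), and threads of the same core or cores on the same MC are
 * at DIST_CORE (1).
 */
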
/*
 * Distance of a node to a core
 */
static int dist_node_to_core(struct toptree *node, struct toptree *core)
{
	struct toptree *core_node;
	int dist_min = DIST_MAX;

	toptree_for_each(core_node, node, CORE)
		dist_min = min(dist_min, dist_core_to_core(core_node, core));
	return dist_min == DIST_MAX ? DIST_EMPTY : dist_min;
}

/*
 * toptree_unify() deletes empty nodes, therefore recreate them afterwards
 */
static void toptree_unify_tree(struct toptree *tree)
{
	int nid;

	toptree_unify(tree);
	for (nid = 0; nid < emu_nodes; nid++)
		toptree_get_child(tree, nid);
}

/*
 * Find the best/nearest node for a given core and ensure that no node
 * gets more than "emu_cores->per_node_target + extra" cores.
 */
static struct toptree *node_for_core(struct toptree *numa, struct toptree *core,
				     int extra)
{
	struct toptree *node, *node_best = NULL;
	int dist_cur, dist_best, cores_target;

	cores_target = emu_cores->per_node_target + extra;
	dist_best = DIST_MAX;
	node_best = NULL;
	toptree_for_each(node, numa, NODE) {
		/* Already pinned cores must use their nodes */
		if (core_pinned_to_node_id(core) == node->id) {
			node_best = node;
			break;
		}
		/* Skip nodes that already have enough cores */
		if (cores_pinned(node) >= cores_target)
			continue;
		dist_cur = dist_node_to_core(node, core);
		if (dist_cur < dist_best) {
			dist_best = dist_cur;
			node_best = node;
		}
	}
	return node_best;
}

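/*
 * Example for node_for_core(): with a per-node target of 4 cores and
 * extra = 1, a node that already holds 5 pinned cores is skipped, so no
 * node can collect more than target + extra cores in one pass.
 */
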
/*
 * Find the best node for each core with respect to "extra" core count
 */
static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys,
				   int extra)
{
	struct toptree *node, *core, *tmp;

	toptree_for_each_safe(core, tmp, phys, CORE) {
		node = node_for_core(numa, core, extra);
		if (!node)
			return;
		toptree_move(core, node);
		pin_core_to_node(core->id, node->id);
	}
}

/*
 * Move structures of given level to specified NUMA node
 */
static void move_level_to_numa_node(struct toptree *node, struct toptree *phys,
				    enum toptree_level level, bool perfect)
{
	int cores_free, cores_target = emu_cores->per_node_target;
	struct toptree *cur, *tmp;

	toptree_for_each_safe(cur, tmp, phys, level) {
		cores_free = cores_target - toptree_count(node, CORE);
		if (perfect) {
			if (cores_free == toptree_count(cur, CORE))
				toptree_move(cur, node);
		} else {
			if (cores_free >= toptree_count(cur, CORE))
				toptree_move(cur, node);
		}
	}
}

/*
 * Move structures of a given level to NUMA nodes. If "perfect" is
 * specified, move only perfectly fitting structures. Otherwise, also
 * move structures that are smaller than needed.
 */
static void move_level_to_numa(struct toptree *numa, struct toptree *phys,
			       enum toptree_level level, bool perfect)
{
	struct toptree *node;

	toptree_for_each(node, numa, NODE)
		move_level_to_numa_node(node, phys, level, perfect);
}

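/*
 * Example for the "perfect" flag: a node that still needs 6 cores takes
 * a 6-core book in the perfect pass; a 4-core book is only taken in the
 * subsequent non-perfect pass, where anything that still fits is moved.
 */
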
/*
 * For the first run try to move the big structures
 */
static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
{
	struct toptree *core;

	/* Always try to move perfectly fitting structures first */
	move_level_to_numa(numa, phys, BOOK, true);
	move_level_to_numa(numa, phys, BOOK, false);
	move_level_to_numa(numa, phys, MC, true);
	move_level_to_numa(numa, phys, MC, false);
	/* Now pin all the moved cores */
	toptree_for_each(core, numa, CORE)
		pin_core_to_node(core->id, core_node(core)->id);
}

/*
 * Allocate new topology and create required nodes
 */
static struct toptree *toptree_new(int id, int nodes)
{
	struct toptree *tree;
	int nid;

	tree = toptree_alloc(TOPOLOGY, id);
	if (!tree)
		goto fail;
	for (nid = 0; nid < nodes; nid++) {
		if (!toptree_get_child(tree, nid))
			goto fail;
	}
	return tree;
fail:
	panic("NUMA emulation could not allocate topology");
}

/*
 * Allocate and initialize core to node mapping
 */
static void create_core_to_node_map(void)
{
	int i;

	emu_cores = kzalloc(sizeof(*emu_cores), GFP_KERNEL);
	if (emu_cores == NULL)
		panic("Could not allocate cores to node memory");
	for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++)
		emu_cores->to_node_id[i] = NODE_ID_FREE;
}

/*
 * Move cores from physical topology into NUMA target topology
 * and try to keep as much of the physical topology as possible.
 */
static struct toptree *toptree_to_numa(struct toptree *phys)
{
	static int first = 1;
	struct toptree *numa;
	int cores_total;

	cores_total = emu_cores->total + cores_free(phys);
	emu_cores->per_node_target = cores_total / emu_nodes;
	numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes);
	if (first) {
		toptree_to_numa_first(numa, phys);
		first = 0;
	}
	toptree_to_numa_single(numa, phys, 0);
	toptree_to_numa_single(numa, phys, 1);
	toptree_unify_tree(numa);

	WARN_ON(cpumask_weight(&phys->mask));
	return numa;
}

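/*
 * Note on the two toptree_to_numa_single() passes above: the first pass
 * (extra = 0) fills each node up to the even per-node target; the second
 * pass (extra = 1) places the remainder of cores_total / emu_nodes by
 * allowing at most one additional core per node.
 */
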
/*
 * Create a toptree out of the physical topology that we got from the hypervisor
 */
static struct toptree *toptree_from_topology(void)
{
	struct toptree *phys, *node, *book, *mc, *core;
	struct cpu_topology_s390 *top;
	int cpu;

	phys = toptree_new(TOPTREE_ID_PHYS, 1);

	for_each_online_cpu(cpu) {
		top = &per_cpu(cpu_topology, cpu);
		node = toptree_get_child(phys, 0);
		book = toptree_get_child(node, top->book_id);
		mc = toptree_get_child(book, top->socket_id);
		core = toptree_get_child(mc, top->core_id);
		if (!book || !mc || !core)
			panic("NUMA emulation could not allocate memory");
		cpumask_set_cpu(cpu, &core->mask);
		toptree_update_mask(mc);
	}
	return phys;
}

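/*
 * The resulting physical tree always has the shape TOPOLOGY -> one NODE
 * (id 0) -> one BOOK per book_id -> one MC per socket_id -> one CORE per
 * core_id, with each online CPU set in the cpumask of its core.
 */
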
/*
 * Add toptree core to topology and create correct CPU masks
 */
static void topology_add_core(struct toptree *core)
{
	struct cpu_topology_s390 *top;
	int cpu;

	for_each_cpu(cpu, &core->mask) {
		top = &per_cpu(cpu_topology, cpu);
		cpumask_copy(&top->thread_mask, &core->mask);
		cpumask_copy(&top->core_mask, &core_mc(core)->mask);
		cpumask_copy(&top->book_mask, &core_book(core)->mask);
		cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]);
		top->node_id = core_node(core)->id;
	}
}

/*
 * Apply toptree to topology and create CPU masks
 */
static void toptree_to_topology(struct toptree *numa)
{
	struct toptree *core;
	int i;

	/* Clear all node masks */
	for (i = 0; i < MAX_NUMNODES; i++)
		cpumask_clear(&node_to_cpumask_map[i]);

	/* Rebuild all masks */
	toptree_for_each(core, numa, CORE)
		topology_add_core(core);
}

/*
 * Show the node to core mapping
 */
static void print_node_to_core_map(void)
{
	int nid, cid;

	if (!numa_debug_enabled)
		return;
	printk(KERN_DEBUG "NUMA node to core mapping\n");
	for (nid = 0; nid < emu_nodes; nid++) {
		printk(KERN_DEBUG "  node %3d: ", nid);
		for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) {
			if (emu_cores->to_node_id[cid] == nid)
				printk(KERN_CONT "%d ", cid);
		}
		printk(KERN_CONT "\n");
	}
}

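/*
 * Illustrative debug output for two emulated nodes with eight cores:
 *
 *   NUMA node to core mapping
 *     node   0: 0 1 2 3
 *     node   1: 4 5 6 7
 */
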
/*
 * Transfer physical topology into a NUMA topology and modify CPU masks
 * according to the NUMA topology.
 *
 * Must be called with "sched_domains_mutex" lock held.
 */
static void emu_update_cpu_topology(void)
{
	struct toptree *phys, *numa;

	if (emu_cores == NULL)
		create_core_to_node_map();
	phys = toptree_from_topology();
	numa = toptree_to_numa(phys);
	toptree_free(phys);
	toptree_to_topology(numa);
	toptree_free(numa);
	print_node_to_core_map();
}

/*
 * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum
 * alignment (needed for memory hotplug).
 */
static unsigned long emu_setup_size_adjust(unsigned long size)
{
	unsigned long size_new;

	size = size ? : CONFIG_EMU_SIZE;
	size_new = roundup(size, memory_block_size_bytes());
	if (size_new == size)
		return size;
	pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n",
		size >> 20, size_new >> 20);
	return size_new;
}

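/*
 * Example: assuming a 256 MB memory block size, emu_size = 600 MB would
 * be rounded up to 768 MB and the increase reported via pr_warn().
 */
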
/*
 * If there is not enough memory for the specified nodes, reduce the node count.
 */
static int emu_setup_nodes_adjust(int nodes)
{
	int nodes_max;

	nodes_max = memblock.memory.total_size / emu_size;
	nodes_max = max(nodes_max, 1);
	if (nodes_max >= nodes)
		return nodes;
	pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes);
	return nodes_max;
}

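/*
 * Example: with 4 GB of memory and emu_size = 1 GB, at most four nodes
 * can be populated, so a request for eight nodes is reduced to four.
 */
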
/*
 * Early emu setup
 */
static void emu_setup(void)
{
	emu_size = emu_setup_size_adjust(emu_size);
	emu_nodes = emu_setup_nodes_adjust(emu_nodes);
	pr_info("Creating %d nodes with memory stripe size %ld MB\n",
		emu_nodes, emu_size >> 20);
}

/*
 * Return node id for given page number
 */
static int emu_pfn_to_nid(unsigned long pfn)
{
	return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes;
}

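/*
 * Example: with emu_size = 512 MB and emu_nodes = 4, memory is striped
 * round-robin: [0, 512M) -> node 0, [512M, 1G) -> node 1, [1G, 1.5G) ->
 * node 2, [1.5G, 2G) -> node 3, [2G, 2.5G) -> node 0 again, and so on.
 */
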
/*
 * Return stripe size
 */
static unsigned long emu_align(void)
{
	return emu_size;
}

/*
 * Return distance between two nodes
 */
static int emu_distance(int node1, int node2)
{
	return (node1 != node2) * EMU_NODE_DIST;
}

/*
 * Define callbacks for generic s390 NUMA infrastructure
 */
const struct numa_mode numa_mode_emu = {
	.name = "emu",
	.setup = emu_setup,
	.update_cpu_topology = emu_update_cpu_topology,
	.__pfn_to_nid = emu_pfn_to_nid,
	.align = emu_align,
	.distance = emu_distance,
};

/*
 * Kernel parameter: emu_nodes=<n>
 */
static int __init early_parse_emu_nodes(char *p)
{
	int count;

	if (kstrtoint(p, 0, &count) != 0 || count <= 0)
		return 0;
	emu_nodes = min(count, MAX_NUMNODES);
	return 0;
}
early_param("emu_nodes", early_parse_emu_nodes);

/*
 * Kernel parameter: emu_size=[<n>[k|M|G|T]]
 */
static int __init early_parse_emu_size(char *p)
{
	emu_size = memparse(p, NULL);
	return 0;
}
early_param("emu_size", early_parse_emu_size);