Linux Audio

Check our new training course

Loading...
v5.4
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
 
 
 
 
   3 *
   4 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   5 *     & Swedish University of Agricultural Sciences.
   6 *
   7 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
   8 *     Agricultural Sciences.
   9 *
  10 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  11 *
  12 * This work is based on the LPC-trie which is originally described in:
  13 *
  14 * An experimental study of compression methods for dynamic tries
  15 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  16 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  17 *
 
  18 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  19 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  20 *
 
  21 * Code from fib_hash has been reused which includes the following header:
  22 *
 
  23 * INET		An implementation of the TCP/IP protocol suite for the LINUX
  24 *		operating system.  INET is implemented using the  BSD Socket
  25 *		interface as the means of communication with the user level.
  26 *
  27 *		IPv4 FIB: lookup engine and maintenance routines.
  28 *
 
  29 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  30 *
 
 
 
 
 
  31 * Substantial contributions to this work comes from:
  32 *
  33 *		David S. Miller, <davem@davemloft.net>
  34 *		Stephen Hemminger <shemminger@osdl.org>
  35 *		Paul E. McKenney <paulmck@us.ibm.com>
  36 *		Patrick McHardy <kaber@trash.net>
  37 */
  38
  39#define VERSION "0.409"
  40
  41#include <linux/cache.h>
  42#include <linux/uaccess.h>
  43#include <linux/bitops.h>
  44#include <linux/types.h>
  45#include <linux/kernel.h>
  46#include <linux/mm.h>
  47#include <linux/string.h>
  48#include <linux/socket.h>
  49#include <linux/sockios.h>
  50#include <linux/errno.h>
  51#include <linux/in.h>
  52#include <linux/inet.h>
  53#include <linux/inetdevice.h>
  54#include <linux/netdevice.h>
  55#include <linux/if_arp.h>
  56#include <linux/proc_fs.h>
  57#include <linux/rcupdate.h>
  58#include <linux/skbuff.h>
  59#include <linux/netlink.h>
  60#include <linux/init.h>
  61#include <linux/list.h>
  62#include <linux/slab.h>
  63#include <linux/export.h>
  64#include <linux/vmalloc.h>
  65#include <linux/notifier.h>
  66#include <net/net_namespace.h>
  67#include <net/ip.h>
  68#include <net/protocol.h>
  69#include <net/route.h>
  70#include <net/tcp.h>
  71#include <net/sock.h>
  72#include <net/ip_fib.h>
  73#include <net/fib_notifier.h>
  74#include <trace/events/fib.h>
  75#include "fib_lookup.h"
  76
  77static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
  78				   enum fib_event_type event_type, u32 dst,
  79				   int dst_len, struct fib_alias *fa)
  80{
  81	struct fib_entry_notifier_info info = {
  82		.dst = dst,
  83		.dst_len = dst_len,
  84		.fi = fa->fa_info,
  85		.tos = fa->fa_tos,
  86		.type = fa->fa_type,
  87		.tb_id = fa->tb_id,
  88	};
  89	return call_fib4_notifier(nb, net, event_type, &info.info);
  90}
  91
  92static int call_fib_entry_notifiers(struct net *net,
  93				    enum fib_event_type event_type, u32 dst,
  94				    int dst_len, struct fib_alias *fa,
  95				    struct netlink_ext_ack *extack)
  96{
  97	struct fib_entry_notifier_info info = {
  98		.info.extack = extack,
  99		.dst = dst,
 100		.dst_len = dst_len,
 101		.fi = fa->fa_info,
 102		.tos = fa->fa_tos,
 103		.type = fa->fa_type,
 104		.tb_id = fa->tb_id,
 105	};
 106	return call_fib4_notifiers(net, event_type, &info.info);
 107}
 108
 109#define MAX_STAT_DEPTH 32
 110
 111#define KEYLENGTH	(8*sizeof(t_key))
 112#define KEY_MAX		((t_key)~0)
 113
 114typedef unsigned int t_key;
 115
 116#define IS_TRIE(n)	((n)->pos >= KEYLENGTH)
 117#define IS_TNODE(n)	((n)->bits)
 118#define IS_LEAF(n)	(!(n)->bits)
 
 119
 120struct key_vector {
 
 
 
 
 121	t_key key;
 122	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
 123	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 124	unsigned char slen;
 125	union {
 126		/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
 127		struct hlist_head leaf;
 128		/* This array is valid if (pos | bits) > 0 (TNODE) */
 129		struct key_vector __rcu *tnode[0];
 130	};
 131};
 132
 133struct tnode {
 
 
 
 134	struct rcu_head rcu;
 135	t_key empty_children;		/* KEYLENGTH bits needed */
 136	t_key full_children;		/* KEYLENGTH bits needed */
 137	struct key_vector __rcu *parent;
 138	struct key_vector kv[1];
 139#define tn_bits kv[0].bits
 140};
 141
 142#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
 143#define LEAF_SIZE	TNODE_SIZE(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 144
 145#ifdef CONFIG_IP_FIB_TRIE_STATS
 146struct trie_use_stats {
 147	unsigned int gets;
 148	unsigned int backtrack;
 149	unsigned int semantic_match_passed;
 150	unsigned int semantic_match_miss;
 151	unsigned int null_node_hit;
 152	unsigned int resize_node_skipped;
 153};
 154#endif
 155
 156struct trie_stat {
 157	unsigned int totdepth;
 158	unsigned int maxdepth;
 159	unsigned int tnodes;
 160	unsigned int leaves;
 161	unsigned int nullpointers;
 162	unsigned int prefixes;
 163	unsigned int nodesizes[MAX_STAT_DEPTH];
 164};
 165
 166struct trie {
 167	struct key_vector kv[1];
 168#ifdef CONFIG_IP_FIB_TRIE_STATS
 169	struct trie_use_stats __percpu *stats;
 170#endif
 171};
 172
 173static struct key_vector *resize(struct trie *t, struct key_vector *tn);
 174static unsigned int tnode_free_size;
 
 
 
 
 
 
 
 175
 176/*
 177 * synchronize_rcu after call_rcu for outstanding dirty memory; it should be
 178 * especially useful before resizing the root node with PREEMPT_NONE configs;
 179 * the value was obtained experimentally, aiming to avoid visible slowdown.
 180 */
 181unsigned int sysctl_fib_sync_mem = 512 * 1024;
 182unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
 183unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
 184
 185static struct kmem_cache *fn_alias_kmem __ro_after_init;
 186static struct kmem_cache *trie_leaf_kmem __ro_after_init;
 187
 188static inline struct tnode *tn_info(struct key_vector *kv)
 
 
 
 189{
 190	return container_of(kv, struct tnode, kv[0]);
 
 
 
 
 191}
 192
 193/* caller must hold RTNL */
 194#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
 195#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
 
 
 
 
 
 
 196
 197/* caller must hold RCU read lock or RTNL */
 198#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
 199#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
 200
 201/* wrapper for rcu_assign_pointer */
 202static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
 
 
 203{
 204	if (n)
 205		rcu_assign_pointer(tn_info(n)->parent, tp);
 206}
 207
 208#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
 
 
 
 
 
 
 
 
 209
 210/* This provides us with the number of children in this node, in the case of a
 211 * leaf this will return 0 meaning none of the children are accessible.
 212 */
 213static inline unsigned long child_length(const struct key_vector *tn)
 214{
 215	return (1ul << tn->bits) & ~(1ul);
 
 
 216}
 217
 218#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
 
 
 
 219
 220static inline unsigned long get_index(t_key key, struct key_vector *kv)
 221{
 222	unsigned long index = key ^ kv->key;
 
 223
 224	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
 
 
 
 
 225		return 0;
 
 226
 227	return index >> kv->pos;
 
 
 228}
 229
/* To understand this stuff, an understanding of keys and all their bits is
 * necessary. Every node in the trie has a key associated with it, but not
 * all of the bits in that key are significant.
 *
 * Consider a node 'n' and its parent 'tp'.
 *
 * If n is a leaf, every bit in its key is significant. Its presence is
 * necessitated by path compression, since during a tree traversal (when
 * searching for a leaf - unless we are doing an insertion) we will completely
 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
 * a potentially successful search, that we have indeed been walking the
 * correct key path.
 *
 * Note that we can never "miss" the correct key in the tree if present by
 * following the wrong path. Path compression ensures that segments of the key
 * that are the same for all keys with a given prefix are skipped, but the
 * skipped part *is* identical for each node in the subtrie below the skipped
 * bit! trie_insert() in this implementation takes care of that.
 *
 * If n is an internal node - a 'tnode' here, the various parts of its key
 * have many different meanings.
 *
 * Example:
 * _________________________________________________________________
 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 * -----------------------------------------------------------------
 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 *
 * _________________________________________________________________
 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 * -----------------------------------------------------------------
 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 *
 * tp->pos = 22
 * tp->bits = 3
 * n->pos = 13
 * n->bits = 4
 *
 * First, let's just ignore the bits that come before the parent tp, that is
 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 * point we do not use them for anything.
 *
 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 * index into the parent's child array. That is, they will be used to find
 * 'n' among tp's children.
 *
 * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
 * for the node n.
 *
 * All the bits we have seen so far are significant to the node n. The rest
 * of the bits are really not needed or indeed known in n->key.
 *
 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 * n's child array, and will of course be different for each child.
 *
 * The rest of the bits, from 0 to (n->pos - 1) - "u" - are completely unknown
 * at this point.
 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 288
 289static const int halve_threshold = 25;
 290static const int inflate_threshold = 50;
 291static const int halve_threshold_root = 15;
 292static const int inflate_threshold_root = 30;
 293
 294static void __alias_free_mem(struct rcu_head *head)
 295{
 296	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 297	kmem_cache_free(fn_alias_kmem, fa);
 298}
 299
 300static inline void alias_free_mem_rcu(struct fib_alias *fa)
 301{
 302	call_rcu(&fa->rcu, __alias_free_mem);
 303}
 304
 305#define TNODE_KMALLOC_MAX \
 306	ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
 307#define TNODE_VMALLOC_MAX \
 308	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
 309
 310static void __node_free_rcu(struct rcu_head *head)
 311{
 312	struct tnode *n = container_of(head, struct tnode, rcu);
 313
 314	if (!n->tn_bits)
 315		kmem_cache_free(trie_leaf_kmem, n);
 316	else
 317		kvfree(n);
 318}
 319
 320#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
 321
 322static struct tnode *tnode_alloc(int bits)
 323{
 324	size_t size;
 325
 326	/* verify bits is within bounds */
 327	if (bits > TNODE_VMALLOC_MAX)
 328		return NULL;
 329
 330	/* determine size and verify it is non-zero and didn't overflow */
 331	size = TNODE_SIZE(1ul << bits);
 
 
 332
 
 
 333	if (size <= PAGE_SIZE)
 334		return kzalloc(size, GFP_KERNEL);
 335	else
 336		return vzalloc(size);
 337}
 338
 339static inline void empty_child_inc(struct key_vector *n)
 340{
 341	tn_info(n)->empty_children++;
 
 
 
 
 
 
 
 
 342
 343	if (!tn_info(n)->empty_children)
 344		tn_info(n)->full_children++;
 
 
 
 
 345}
 346
 347static inline void empty_child_dec(struct key_vector *n)
 348{
 349	if (!tn_info(n)->empty_children)
 350		tn_info(n)->full_children--;
 
 
 
 351
 352	tn_info(n)->empty_children--;
 
 
 
 
 
 
 353}
 354
 355static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
 356{
 357	struct key_vector *l;
 358	struct tnode *kv;
 359
 360	kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 361	if (!kv)
 362		return NULL;
 
 
 363
 364	/* initialize key vector */
 365	l = kv->kv;
 366	l->key = key;
 367	l->pos = 0;
 368	l->bits = 0;
 369	l->slen = fa->fa_slen;
 370
 371	/* link leaf to fib alias */
 372	INIT_HLIST_HEAD(&l->leaf);
 373	hlist_add_head(&fa->fa_list, &l->leaf);
 374
 
 
 
 
 
 
 
 375	return l;
 376}
 377
 378static struct key_vector *tnode_new(t_key key, int pos, int bits)
 379{
 380	unsigned int shift = pos + bits;
 381	struct key_vector *tn;
 382	struct tnode *tnode;
 383
 384	/* verify bits and pos their msb bits clear and values are valid */
 385	BUG_ON(!bits || (shift > KEYLENGTH));
 386
 387	tnode = tnode_alloc(bits);
 388	if (!tnode)
 389		return NULL;
 390
 391	pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
 392		 sizeof(struct key_vector *) << bits);
 393
 394	if (bits == KEYLENGTH)
 395		tnode->full_children = 1;
 396	else
 397		tnode->empty_children = 1ul << bits;
 398
 399	tn = tnode->kv;
 400	tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
 401	tn->pos = pos;
 402	tn->bits = bits;
 403	tn->slen = pos;
 
 
 
 404
 
 
 405	return tn;
 406}
 407
 408/* Check whether a tnode 'n' is "full", i.e. it is an internal node
 
 409 * and no bits are skipped. See discussion in dyntree paper p. 6
 410 */
 411static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
 
 412{
 413	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
 
 
 
 414}
 415
 416/* Add a child at position i overwriting the old value.
 417 * Update the value of full_children and empty_children.
 418 */
 419static void put_child(struct key_vector *tn, unsigned long i,
 420		      struct key_vector *n)
 421{
 422	struct key_vector *chi = get_child(tn, i);
 423	int isfull, wasfull;
 
 
 
 
 
 
 
 
 
 
 
 424
 425	BUG_ON(i >= child_length(tn));
 426
 427	/* update emptyChildren, overflow into fullChildren */
 428	if (!n && chi)
 429		empty_child_inc(tn);
 430	if (n && !chi)
 431		empty_child_dec(tn);
 432
 433	/* update fullChildren */
 434	wasfull = tnode_full(tn, chi);
 435	isfull = tnode_full(tn, n);
 436
 
 437	if (wasfull && !isfull)
 438		tn_info(tn)->full_children--;
 439	else if (!wasfull && isfull)
 440		tn_info(tn)->full_children++;
 441
 442	if (n && (tn->slen < n->slen))
 443		tn->slen = n->slen;
 444
 445	rcu_assign_pointer(tn->tnode[i], n);
 446}
 447
 448static void update_children(struct key_vector *tn)
 
 449{
 450	unsigned long i;
 
 
 
 
 451
 452	/* update all of the child parent pointers */
 453	for (i = child_length(tn); i;) {
 454		struct key_vector *inode = get_child(tn, --i);
 455
 456		if (!inode)
 457			continue;
 458
 459		/* Either update the children of a tnode that
 460		 * already belongs to us or update the child
 461		 * to point to ourselves.
 462		 */
 463		if (node_parent(inode) == tn)
 464			update_children(inode);
 465		else
 466			node_set_parent(inode, tn);
 467	}
 468}
 
 
 
 
 
 
 469
 470static inline void put_child_root(struct key_vector *tp, t_key key,
 471				  struct key_vector *n)
 472{
 473	if (IS_TRIE(tp))
 474		rcu_assign_pointer(tp->tnode[0], n);
 475	else
 476		put_child(tp, get_index(key, tp), n);
 477}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 478
 479static inline void tnode_free_init(struct key_vector *tn)
 480{
 481	tn_info(tn)->rcu.next = NULL;
 482}
 483
 484static inline void tnode_free_append(struct key_vector *tn,
 485				     struct key_vector *n)
 486{
 487	tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
 488	tn_info(tn)->rcu.next = &tn_info(n)->rcu;
 489}
 490
 491static void tnode_free(struct key_vector *tn)
 492{
 493	struct callback_head *head = &tn_info(tn)->rcu;
 
 
 
 
 494
 495	while (head) {
 496		head = head->next;
 497		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
 498		node_free(tn);
 
 499
 500		tn = container_of(head, struct tnode, rcu)->kv;
 501	}
 502
 503	if (tnode_free_size >= sysctl_fib_sync_mem) {
 504		tnode_free_size = 0;
 505		synchronize_rcu();
 
 
 
 
 506	}
 507}
 508
 509static struct key_vector *replace(struct trie *t,
 510				  struct key_vector *oldtnode,
 511				  struct key_vector *tn)
 512{
 513	struct key_vector *tp = node_parent(oldtnode);
 514	unsigned long i;
 
 
 
 
 515
 516	/* setup the parent pointer out of and back into this node */
 517	NODE_INIT_PARENT(tn, tp);
 518	put_child_root(tp, tn->key, tn);
 
 
 
 
 
 
 
 
 
 
 
 
 519
 520	/* update all of the child parent pointers */
 521	update_children(tn);
 522
 523	/* all pointers should be clean so we are done */
 524	tnode_free(oldtnode);
 
 
 
 525
 526	/* resize children now that oldtnode is freed */
 527	for (i = child_length(tn); i;) {
 528		struct key_vector *inode = get_child(tn, --i);
 
 
 529
 530		/* resize child node */
 531		if (tnode_full(tn, inode))
 532			tn = resize(t, inode);
 
 533	}
 
 
 534
 535	return tp;
 
 
 
 
 
 
 
 
 
 
 
 536}
 537
 538static struct key_vector *inflate(struct trie *t,
 539				  struct key_vector *oldtnode)
 540{
 541	struct key_vector *tn;
 542	unsigned long i;
 543	t_key m;
 544
 545	pr_debug("In inflate\n");
 546
 547	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
 548	if (!tn)
 549		goto notnode;
 550
 551	/* prepare oldtnode to be freed */
 552	tnode_free_init(oldtnode);
 553
 554	/* Assemble all of the pointers in our cluster, in this case that
 555	 * represents all of the pointers out of our allocated nodes that
 556	 * point to existing tnodes and the links between our allocated
 557	 * nodes.
 
 558	 */
 559	for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
 560		struct key_vector *inode = get_child(oldtnode, --i);
 561		struct key_vector *node0, *node1;
 562		unsigned long j, k;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 563
 564		/* An empty child */
 565		if (!inode)
 566			continue;
 567
 568		/* A leaf or an internal node with skipped bits */
 569		if (!tnode_full(oldtnode, inode)) {
 570			put_child(tn, get_index(inode->key, tn), inode);
 
 
 
 
 
 
 
 571			continue;
 572		}
 573
 574		/* drop the node in the old tnode free list */
 575		tnode_free_append(oldtnode, inode);
 576
 577		/* An internal node with two children */
 
 
 578		if (inode->bits == 1) {
 579			put_child(tn, 2 * i + 1, get_child(inode, 1));
 580			put_child(tn, 2 * i, get_child(inode, 0));
 
 
 581			continue;
 582		}
 583
 
 
 584		/* We will replace this node 'inode' with two new
 585		 * ones, 'node0' and 'node1', each with half of the
 586		 * original children. The two new nodes will have
 587		 * a position one bit further down the key and this
 588		 * means that the "significant" part of their keys
 589		 * (see the discussion near the top of this file)
 590		 * will differ by one bit, which will be "0" in
 591		 * node0's key and "1" in node1's key. Since we are
 592		 * moving the key position by one step, the bit that
 593		 * we are moving away from - the bit at position
 594		 * (tn->pos) - is the one that will differ between
 595		 * node0 and node1. So... we synthesize that bit in the
 596		 * two new keys.
 
 
 597		 */
 598		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
 599		if (!node1)
 600			goto nomem;
 601		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
 602
 603		tnode_free_append(tn, node1);
 604		if (!node0)
 605			goto nomem;
 606		tnode_free_append(tn, node0);
 607
 608		/* populate child pointers in new nodes */
 609		for (k = child_length(inode), j = k / 2; j;) {
 610			put_child(node1, --j, get_child(inode, --k));
 611			put_child(node0, j, get_child(inode, j));
 612			put_child(node1, --j, get_child(inode, --k));
 613			put_child(node0, j, get_child(inode, j));
 614		}
 615
 616		/* link new nodes to parent */
 617		NODE_INIT_PARENT(node1, tn);
 618		NODE_INIT_PARENT(node0, tn);
 619
 620		/* link parent to nodes */
 621		put_child(tn, 2 * i + 1, node1);
 622		put_child(tn, 2 * i, node0);
 623	}
 624
 625	/* setup the parent pointers into and out of this node */
 626	return replace(t, oldtnode, tn);
 627nomem:
 628	/* all pointers should be clean so we are done */
 629	tnode_free(tn);
 630notnode:
 631	return NULL;
 632}
 633
 634static struct key_vector *halve(struct trie *t,
 635				struct key_vector *oldtnode)
 636{
 637	struct key_vector *tn;
 638	unsigned long i;
 639
 640	pr_debug("In halve\n");
 641
 642	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
 643	if (!tn)
 644		goto notnode;
 645
 646	/* prepare oldtnode to be freed */
 647	tnode_free_init(oldtnode);
 648
 649	/* Assemble all of the pointers in our cluster, in this case that
 650	 * represents all of the pointers out of our allocated nodes that
 651	 * point to existing tnodes and the links between our allocated
 652	 * nodes.
 653	 */
 654	for (i = child_length(oldtnode); i;) {
 655		struct key_vector *node1 = get_child(oldtnode, --i);
 656		struct key_vector *node0 = get_child(oldtnode, --i);
 657		struct key_vector *inode;
 658
 659		/* At least one of the children is empty */
 660		if (!node1 || !node0) {
 661			put_child(tn, i / 2, node1 ? : node0);
 662			continue;
 663		}
 
 
 664
 665		/* Two nonempty children */
 666		inode = tnode_new(node0->key, oldtnode->pos, 1);
 667		if (!inode)
 668			goto nomem;
 669		tnode_free_append(tn, inode);
 670
 671		/* initialize pointers out of node */
 672		put_child(inode, 1, node1);
 673		put_child(inode, 0, node0);
 674		NODE_INIT_PARENT(inode, tn);
 675
 676		/* link parent to node */
 677		put_child(tn, i / 2, inode);
 678	}
 679
 680	/* setup the parent pointers into and out of this node */
 681	return replace(t, oldtnode, tn);
 682nomem:
 683	/* all pointers should be clean so we are done */
 684	tnode_free(tn);
 685notnode:
 686	return NULL;
 687}
 688
 689static struct key_vector *collapse(struct trie *t,
 690				   struct key_vector *oldtnode)
 691{
 692	struct key_vector *n, *tp;
 693	unsigned long i;
 
 
 694
 695	/* scan the tnode looking for that one child that might still exist */
 696	for (n = NULL, i = child_length(oldtnode); !n && i;)
 697		n = get_child(oldtnode, --i);
 698
 699	/* compress one level */
 700	tp = node_parent(oldtnode);
 701	put_child_root(tp, oldtnode->key, n);
 702	node_set_parent(n, tp);
 703
 704	/* drop dead node */
 705	node_free(oldtnode);
 706
 707	return tp;
 708}
 
 
 
 
 709
 710static unsigned char update_suffix(struct key_vector *tn)
 711{
 712	unsigned char slen = tn->pos;
 713	unsigned long stride, i;
 714	unsigned char slen_max;
 715
 716	/* only vector 0 can have a suffix length greater than or equal to
 717	 * tn->pos + tn->bits, the second highest node will have a suffix
 718	 * length at most of tn->pos + tn->bits - 1
 719	 */
 720	slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
 721
 722	/* search though the list of children looking for nodes that might
 723	 * have a suffix greater than the one we currently have.  This is
 724	 * why we start with a stride of 2 since a stride of 1 would
 725	 * represent the nodes with suffix length equal to tn->pos
 726	 */
 727	for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
 728		struct key_vector *n = get_child(tn, i);
 729
 730		if (!n || (n->slen <= slen))
 731			continue;
 732
 733		/* update stride and slen based on new value */
 734		stride <<= (n->slen - slen);
 735		slen = n->slen;
 736		i &= ~(stride - 1);
 737
 738		/* stop searching if we have hit the maximum possible value */
 739		if (slen >= slen_max)
 740			break;
 741	}
 742
 743	tn->slen = slen;
 
 744
 745	return slen;
 746}
 747
/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 * the Helsinki University of Technology and Matti Tikkanen of Nokia
 * Telecommunications, page 6:
 * "A node is doubled if the ratio of non-empty children to all
 * children in the *doubled* node is at least 'high'."
 *
 * 'high' in this instance is the variable 'inflate_threshold'. It
 * is expressed as a percentage, so we multiply it with
 * child_length() and instead of multiplying by 2 (since the
 * child array will be doubled by inflate()) and multiplying
 * the left-hand side by 100 (to handle the percentage thing) we
 * multiply the left-hand side by 50.
 *
 * The left-hand side may look a bit weird: child_length(tn)
 * - tn->empty_children is of course the number of non-null children
 * in the current node. tn->full_children is the number of "full"
 * children, that is non-null tnodes with a skip value of 0.
 * All of those will be doubled in the resulting inflated tnode, so
 * we just count them one extra time here.
 *
 * A clearer way to write this would be:
 *
 * to_be_doubled = tn->full_children;
 * not_to_be_doubled = child_length(tn) - tn->empty_children -
 *     tn->full_children;
 *
 * new_child_length = child_length(tn) * 2;
 *
 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 *      new_child_length;
 * if (new_fill_factor >= inflate_threshold)
 *
 * ...and so on, though it would mess up the while () loop.
 *
 * anyway,
 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 *      inflate_threshold
 *
 * avoid a division:
 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 *      inflate_threshold * new_child_length
 *
 * expand not_to_be_doubled and to_be_doubled, and shorten:
 * 100 * (child_length(tn) - tn->empty_children +
 *    tn->full_children) >= inflate_threshold * new_child_length
 *
 * expand new_child_length:
 * 100 * (child_length(tn) - tn->empty_children +
 *    tn->full_children) >=
 *      inflate_threshold * child_length(tn) * 2
 *
 * shorten again:
 * 50 * (tn->full_children + child_length(tn) -
 *    tn->empty_children) >= inflate_threshold *
 *    child_length(tn)
 *
 */
 805static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
 806{
 807	unsigned long used = child_length(tn);
 808	unsigned long threshold = used;
 809
 810	/* Keep root node larger */
 811	threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
 812	used -= tn_info(tn)->empty_children;
 813	used += tn_info(tn)->full_children;
 814
 815	/* if bits == KEYLENGTH then pos = 0, and will fail below */
 
 
 
 816
 817	return (used > 1) && tn->pos && ((50 * used) >= threshold);
 
 
 
 
 
 
 
 
 
 
 
 818}
 819
 820static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
 821{
 822	unsigned long used = child_length(tn);
 823	unsigned long threshold = used;
 824
 825	/* Keep root node larger */
 826	threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
 827	used -= tn_info(tn)->empty_children;
 
 
 828
 829	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
 
 
 830
 831	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
 832}
 833
 834static inline bool should_collapse(struct key_vector *tn)
 835{
 836	unsigned long used = child_length(tn);
 837
 838	used -= tn_info(tn)->empty_children;
 839
 840	/* account for bits == KEYLENGTH case */
 841	if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
 842		used -= KEY_MAX;
 843
 844	/* One child or none, time to drop us from the trie */
 845	return used < 2;
 846}
 847
 848#define MAX_WORK 10
 849static struct key_vector *resize(struct trie *t, struct key_vector *tn)
 850{
 851#ifdef CONFIG_IP_FIB_TRIE_STATS
 852	struct trie_use_stats __percpu *stats = t->stats;
 853#endif
 854	struct key_vector *tp = node_parent(tn);
 855	unsigned long cindex = get_index(tn->key, tp);
 856	int max_work = MAX_WORK;
 857
 858	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 859		 tn, inflate_threshold, halve_threshold);
 860
 861	/* track the tnode via the pointer from the parent instead of
 862	 * doing it ourselves.  This way we can let RCU fully do its
 863	 * thing without us interfering
 864	 */
 865	BUG_ON(tn != get_child(tp, cindex));
 866
 867	/* Double as long as the resulting node has a number of
 868	 * nonempty nodes that are above the threshold.
 869	 */
 870	while (should_inflate(tp, tn) && max_work) {
 871		tp = inflate(t, tn);
 872		if (!tp) {
 873#ifdef CONFIG_IP_FIB_TRIE_STATS
 874			this_cpu_inc(stats->resize_node_skipped);
 875#endif
 876			break;
 877		}
 878
 879		max_work--;
 880		tn = get_child(tp, cindex);
 
 881	}
 
 882
 883	/* update parent in case inflate failed */
 884	tp = node_parent(tn);
 885
 886	/* Return if at least one inflate is run */
 887	if (max_work != MAX_WORK)
 888		return tp;
 889
 890	/* Halve as long as the number of empty children in this
 891	 * node is above threshold.
 892	 */
 893	while (should_halve(tp, tn) && max_work) {
 894		tp = halve(t, tn);
 895		if (!tp) {
 896#ifdef CONFIG_IP_FIB_TRIE_STATS
 897			this_cpu_inc(stats->resize_node_skipped);
 898#endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 899			break;
 900		}
 901
 902		max_work--;
 903		tn = get_child(tp, cindex);
 904	}
 
 905
 906	/* Only one child remains */
 907	if (should_collapse(tn))
 908		return collapse(t, tn);
 909
 910	/* update parent in case halve failed */
 911	return node_parent(tn);
 912}
 913
 914static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
 915{
 916	unsigned char node_slen = tn->slen;
 
 
 917
 918	while ((node_slen > tn->pos) && (node_slen > slen)) {
 919		slen = update_suffix(tn);
 920		if (node_slen == slen)
 921			break;
 922
 923		tn = node_parent(tn);
 924		node_slen = tn->slen;
 925	}
 926}
 927
 928static void node_push_suffix(struct key_vector *tn, unsigned char slen)
 929{
 930	while (tn->slen < slen) {
 931		tn->slen = slen;
 932		tn = node_parent(tn);
 933	}
 934}
 935
 936/* rcu_read_lock needs to be hold by caller from readside */
 937static struct key_vector *fib_find_node(struct trie *t,
 938					struct key_vector **tp, u32 key)
 939{
 940	struct key_vector *pn, *n = t->kv;
 941	unsigned long index = 0;
 942
 943	do {
 944		pn = n;
 945		n = get_child_rcu(n, index);
 946
 947		if (!n)
 
 948			break;
 
 
 949
 950		index = get_cindex(key, n);
 
 
 951
 952		/* This bit of code is a bit tricky but it combines multiple
 953		 * checks into a single check.  The prefix consists of the
 954		 * prefix plus zeros for the bits in the cindex. The index
 955		 * is the difference between the key and this value.  From
 956		 * this we can actually derive several pieces of data.
 957		 *   if (index >= (1ul << bits))
 958		 *     we have a mismatch in skip bits and failed
 959		 *   else
 960		 *     we know the value is cindex
 961		 *
 962		 * This check is safe even if bits == KEYLENGTH due to the
 963		 * fact that we can only allocate a node with 32 bits if a
 964		 * long is greater than 32 bits.
 965		 */
 966		if (index >= (1ul << n->bits)) {
 967			n = NULL;
 968			break;
 969		}
 970
 971		/* keep searching until we find a perfect match leaf or NULL */
 972	} while (IS_TNODE(n));
 973
 974	*tp = pn;
 
 
 
 
 
 
 
 
 
 975
 976	return n;
 977}
 978
 979/* Return the first fib alias matching TOS with
 980 * priority less than or equal to PRIO.
 981 */
 982static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
 983					u8 tos, u32 prio, u32 tb_id)
 984{
 985	struct fib_alias *fa;
 
 
 
 
 
 
 
 
 
 
 986
 987	if (!fah)
 988		return NULL;
 989
 990	hlist_for_each_entry(fa, fah, fa_list) {
 991		if (fa->fa_slen < slen)
 992			continue;
 993		if (fa->fa_slen != slen)
 994			break;
 995		if (fa->tb_id > tb_id)
 996			continue;
 997		if (fa->tb_id != tb_id)
 
 
 
 
 998			break;
 999		if (fa->fa_tos > tos)
1000			continue;
1001		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
1002			return fa;
1003	}
1004
1005	return NULL;
1006}
 
 
 
1007
/* Resize @tn and then each node resize() hands back, until the node
 * returned is the trie root vector.
 */
static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
	struct key_vector *node = tn;

	for (;;) {
		if (IS_TRIE(node))
			return;

		node = resize(t, node);
	}
}
1013
1014static int fib_insert_node(struct trie *t, struct key_vector *tp,
1015			   struct fib_alias *new, t_key key)
1016{
1017	struct key_vector *n, *l;
1018
1019	l = leaf_new(key, new);
1020	if (!l)
1021		goto noleaf;
1022
1023	/* retrieve child from parent node */
1024	n = get_child(tp, get_index(key, tp));
1025
1026	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
1027	 *
1028	 *  Add a new tnode here
1029	 *  first tnode need some special handling
1030	 *  leaves us in position for handling as case 3
1031	 */
1032	if (n) {
1033		struct key_vector *tn;
1034
1035		tn = tnode_new(key, __fls(key ^ n->key), 1);
1036		if (!tn)
1037			goto notnode;
1038
1039		/* initialize routes out of node */
1040		NODE_INIT_PARENT(tn, tp);
1041		put_child(tn, get_index(key, tn) ^ 1, n);
1042
1043		/* start adding routes into the node */
1044		put_child_root(tp, key, tn);
1045		node_set_parent(n, tn);
1046
1047		/* parent now has a NULL spot where the leaf can go */
1048		tp = tn;
 
1049	}
1050
1051	/* Case 3: n is NULL, and will just insert a new leaf */
1052	node_push_suffix(tp, new->fa_slen);
1053	NODE_INIT_PARENT(l, tp);
1054	put_child_root(tp, key, l);
1055	trie_rebalance(t, tp);
1056
1057	return 0;
1058notnode:
1059	node_free(l);
1060noleaf:
1061	return -ENOMEM;
1062}
1063
1064/* fib notifier for ADD is sent before calling fib_insert_alias with
1065 * the expectation that the only possible failure ENOMEM
1066 */
1067static int fib_insert_alias(struct trie *t, struct key_vector *tp,
1068			    struct key_vector *l, struct fib_alias *new,
1069			    struct fib_alias *fa, t_key key)
1070{
1071	if (!l)
1072		return fib_insert_node(t, tp, new, key);
1073
1074	if (fa) {
1075		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
1076	} else {
1077		struct fib_alias *last;
 
 
 
 
1078
1079		hlist_for_each_entry(last, &l->leaf, fa_list) {
1080			if (new->fa_slen < last->fa_slen)
1081				break;
1082			if ((new->fa_slen == last->fa_slen) &&
1083			    (new->tb_id > last->tb_id))
1084				break;
1085			fa = last;
 
 
 
 
1086		}
1087
1088		if (fa)
1089			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
1090		else
1091			hlist_add_head_rcu(&new->fa_list, &l->leaf);
1092	}
1093
1094	/* if we added to the tail node then we need to update slen */
1095	if (l->slen < new->fa_slen) {
1096		l->slen = new->fa_slen;
1097		node_push_suffix(tp, new->fa_slen);
1098	}
1099
1100	return 0;
1101}
 
1102
1103static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
1104{
1105	if (plen > KEYLENGTH) {
1106		NL_SET_ERR_MSG(extack, "Invalid prefix length");
1107		return false;
 
 
 
1108	}
1109
1110	if ((plen < KEYLENGTH) && (key << plen)) {
1111		NL_SET_ERR_MSG(extack,
1112			       "Invalid prefix for given prefix length");
1113		return false;
1114	}
 
1115
1116	return true;
 
 
1117}
1118
/* Insert, append or replace the route described by @cfg in table @tb.
 *
 * Returns 0 on success.  Failures visible in this function:
 *   -EINVAL   prefix/length rejected by fib_valid_key_len()
 *   PTR_ERR() of whatever fib_create_info() returned on failure
 *   -EEXIST   an alias with the same prefix, tos and priority exists and
 *             NLM_F_EXCL was requested, or an alias with the same type
 *             and fib_info exists (for NLM_F_REPLACE this is only
 *             tolerated when it is the first alias of the group)
 *   -ENOENT   a new alias is needed but NLM_F_CREATE was not requested
 *   -ENOBUFS  the new fib_alias could not be allocated
 *   any error returned by call_fib_entry_notifiers() or
 *   fib_insert_alias()
 *
 * On every failure after fib_create_info() succeeded, the reference on
 * the new fib_info is dropped with fib_release_info().
 *
 * Caller must hold RTNL.
 */
int fib_table_insert(struct net *net, struct fib_table *tb,
		     struct fib_config *cfg, struct netlink_ext_ack *extack)
{
	enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
	struct trie *t = (struct trie *)tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct key_vector *l, *tp;
	u16 nlflags = NLM_F_EXCL;
	struct fib_info *fi;
	u8 plen = cfg->fc_dst_len;
	u8 slen = KEYLENGTH - plen;
	u8 tos = cfg->fc_tos;
	u32 key;
	int err;

	key = ntohl(cfg->fc_dst);

	if (!fib_valid_key_len(key, plen, extack))
		return -EINVAL;

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	fi = fib_create_info(cfg, extack);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	/* l is the leaf for this key (or NULL); tp is the node it hangs, or
	 * would hang, off
	 */
	l = fib_find_node(t, &tp, key);
	fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
				tb->tb_id) : NULL;

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the tail of the section matching the suffix length
	 * of the new alias.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		/* default result for every "goto out" in this branch unless
		 * overridden below
		 */
		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		nlflags &= ~NLM_F_EXCL;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		hlist_for_each_entry_from(fa, fa_list) {
			if ((fa->fa_slen != slen) ||
			    (fa->tb_id != tb->tb_id) ||
			    (fa->fa_tos != tos))
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			nlflags |= NLM_F_REPLACE;
			/* the replace always targets the first alias of the
			 * group
			 */
			fa = fa_first;
			if (fa_match) {
				/* replacing an alias with itself is a no-op
				 * success; a match further down the list
				 * keeps err == -EEXIST
				 */
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (!new_fa)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;
			new_fa->fa_slen = fa->fa_slen;
			new_fa->tb_id = tb->tb_id;
			new_fa->fa_default = -1;

			/* notifiers get the chance to veto before the list
			 * is touched
			 */
			err = call_fib_entry_notifiers(net,
						       FIB_EVENT_ENTRY_REPLACE,
						       key, plen, new_fa,
						       extack);
			if (err)
				goto out_free_new_fa;

			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				  tb->tb_id, &cfg->fc_nlinfo, nlflags);

			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);

			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			/* only flush the cache if the old alias was marked
			 * as accessed
			 */
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		/* for APPEND fa stays where the scan above stopped, so the
		 * new alias goes in front of it (or to the tail if NULL);
		 * otherwise it goes in front of the first alias of the group
		 */
		if (cfg->fc_nlflags & NLM_F_APPEND) {
			event = FIB_EVENT_ENTRY_APPEND;
			nlflags |= NLM_F_APPEND;
		} else {
			fa = fa_first;
		}
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	nlflags |= NLM_F_CREATE;
	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (!new_fa)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	new_fa->fa_slen = slen;
	new_fa->tb_id = tb->tb_id;
	new_fa->fa_default = -1;

	err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
	if (err)
		goto out_free_new_fa;

	/* Insert new entry to the list. */
	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
	if (err)
		goto out_fib_notif;

	/* a zero length prefix is counted in tb_num_default */
	if (!plen)
		tb->tb_num_default++;

	rt_cache_flush(cfg->fc_nlinfo.nl_net);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
		  &cfg->fc_nlinfo, nlflags);
succeeded:
	return 0;

out_fib_notif:
	/* notifier was sent that entry would be added to trie, but
	 * the add failed and need to recover. Only failure for
	 * fib_insert_alias is ENOMEM.
	 */
	NL_SET_ERR_MSG(extack, "Failed to insert route into trie");
	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key,
				 plen, new_fa, NULL);
out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}
1303
1304static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
1305{
1306	t_key prefix = n->key;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1307
1308	return (key ^ prefix) & (prefix | -prefix);
 
 
 
 
 
1309}
1310
/* Look up flp->daddr in table @tb.
 *
 * The walk has three parts: step 1 descends the trie along the key,
 * step 2 switches to prefix matching and backtraces through the nodes
 * recorded on the way down, and step 3 scans the aliases of a candidate
 * leaf.  If no alias of the leaf is usable the search resumes at the
 * backtrace label.
 *
 * On a usable alias and nexthop, @res is filled in and the value of
 * fib_props[fa->fa_type].error for that alias is returned (not negative
 * on this path).  A negative fib_props error, or the RTN_BLACKHOLE error
 * for a blackhole nexthop object, is returned without filling @res.
 * -EAGAIN is returned when the trie has no node below the root or when
 * backtracing reaches the root without a match.
 *
 * Unless FIB_LOOKUP_NOREF is set in @fib_flags, fi->fib_clntref is
 * incremented for the fib_info stored in @res.
 *
 * Should be called with rcu_read_lock held.
 */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	const t_key key = ntohl(flp->daddr);
	struct key_vector *n, *pn;
	struct fib_alias *fa;
	unsigned long index;
	t_key cindex;

	/* pn/cindex remember where backtracing has to resume */
	pn = t->kv;
	cindex = 0;

	n = get_child_rcu(pn, cindex);
	if (!n) {
		trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
		return -EAGAIN;
	}

#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->gets);
#endif

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		index = get_cindex(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if (index >= (1ul << bits))
		 *     we have a mismatch in skip bits and failed
		 *   else
		 *     we know the value is cindex
		 *
		 * This check is safe even if bits == KEYLENGTH due to the
		 * fact that we can only allocate a node with 32 bits if a
		 * long is greater than 32 bits.
		 */
		if (index >= (1ul << n->bits))
			break;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
			goto found;

		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
		 */
		if (n->slen > n->pos) {
			pn = n;
			cindex = index;
		}

		n = get_child_rcu(n, index);
		if (unlikely(!n))
			goto backtrace;
	}

	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct key_vector __rcu **cptr = n->tnode;

		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
		 */
		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
			goto backtrace;

		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;

		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
		 */

		/* NOTE: the backtrace label sits inside this loop body, so
		 * a "goto backtrace" from anywhere in the function enters
		 * the loop after the condition and re-evaluates it with the
		 * cptr computed at the bottom.
		 */
		while ((n = rcu_dereference(*cptr)) == NULL) {
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			if (!n)
				this_cpu_inc(stats->null_node_hit);
#endif
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				/* If we don't have a parent then there is
				 * nothing for us to do as we do not have any
				 * further nodes to parse.
				 */
				if (IS_TRIE(pn)) {
					trace_fib_table_lookup(tb->tb_id, flp,
							       NULL, -EAGAIN);
					return -EAGAIN;
				}
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				pn = node_parent_rcu(pn);
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->tnode[cindex];
		}
	}

found:
	/* this line carries forward the xor from earlier in the function */
	index = key ^ n->key;

	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;
		int nhsel, err;

		/* skip aliases whose suffix does not cover the differing
		 * bits; the outer test keeps the shift below the width of
		 * a long
		 */
		if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
			if (index >= (1ul << fa->fa_slen))
				continue;
		}
		/* an alias with tos 0 matches any flow tos */
		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
			continue;
		if (fi->fib_dead)
			continue;
		if (fa->fa_info->fib_scope < flp->flowi4_scope)
			continue;
		fib_alias_accessed(fa);
		err = fib_props[fa->fa_type].error;
		if (unlikely(err < 0)) {
			/* also entered via goto for blackhole nexthops */
out_reject:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
			trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
			return err;
		}
		if (fi->fib_flags & RTNH_F_DEAD)
			continue;

		if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) {
			err = fib_props[RTN_BLACKHOLE].error;
			goto out_reject;
		}

		/* pick the first path that is alive and acceptable for
		 * this flow
		 */
		for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
			struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);

			if (nhc->nhc_flags & RTNH_F_DEAD)
				continue;
			if (ip_ignore_linkdown(nhc->nhc_dev) &&
			    nhc->nhc_flags & RTNH_F_LINKDOWN &&
			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
				continue;
			if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
				if (flp->flowi4_oif &&
				    flp->flowi4_oif != nhc->nhc_oif)
					continue;
			}

			if (!(fib_flags & FIB_LOOKUP_NOREF))
				refcount_inc(&fi->fib_clntref);

			res->prefix = htonl(n->key);
			res->prefixlen = KEYLENGTH - fa->fa_slen;
			res->nh_sel = nhsel;
			res->nhc = nhc;
			res->type = fa->fa_type;
			res->scope = fi->fib_scope;
			res->fi = fi;
			res->table = tb;
			res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
			trace_fib_table_lookup(tb->tb_id, flp, nhc, err);

			return err;
		}
	}
#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->semantic_match_miss);
#endif
	/* nothing usable in this leaf: continue with shorter prefixes */
	goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
1514
1515static void fib_remove_alias(struct trie *t, struct key_vector *tp,
1516			     struct key_vector *l, struct fib_alias *old)
 
 
1517{
1518	/* record the location of the previous list_info entry */
1519	struct hlist_node **pprev = old->fa_list.pprev;
1520	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
1521
1522	/* remove the fib_alias from the list */
1523	hlist_del_rcu(&old->fa_list);
1524
1525	/* if we emptied the list this leaf will be freed and we can sort
1526	 * out parent suffix lengths as a part of trie_rebalance
1527	 */
1528	if (hlist_empty(&l->leaf)) {
1529		if (tp->slen == l->slen)
1530			node_pull_suffix(tp, tp->pos);
1531		put_child_root(tp, l->key, NULL);
1532		node_free(l);
1533		trie_rebalance(t, tp);
1534		return;
1535	}
1536
1537	/* only access fa if it is pointing at the last valid hlist_node */
1538	if (*pprev)
1539		return;
1540
1541	/* update the trie with the latest suffix length */
1542	l->slen = fa->fa_slen;
1543	node_pull_suffix(tp, fa->fa_slen);
1544}
1545
1546/* Caller must hold RTNL. */
1547int fib_table_delete(struct net *net, struct fib_table *tb,
1548		     struct fib_config *cfg, struct netlink_ext_ack *extack)
 
1549{
1550	struct trie *t = (struct trie *) tb->tb_data;
1551	struct fib_alias *fa, *fa_to_delete;
1552	struct key_vector *l, *tp;
1553	u8 plen = cfg->fc_dst_len;
1554	u8 slen = KEYLENGTH - plen;
1555	u8 tos = cfg->fc_tos;
1556	u32 key;
 
 
 
 
 
 
1557
1558	key = ntohl(cfg->fc_dst);
 
1559
1560	if (!fib_valid_key_len(key, plen, extack))
1561		return -EINVAL;
1562
1563	l = fib_find_node(t, &tp, key);
 
 
1564	if (!l)
1565		return -ESRCH;
1566
1567	fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
 
 
1568	if (!fa)
1569		return -ESRCH;
1570
1571	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1572
1573	fa_to_delete = NULL;
1574	hlist_for_each_entry_from(fa, fa_list) {
 
1575		struct fib_info *fi = fa->fa_info;
1576
1577		if ((fa->fa_slen != slen) ||
1578		    (fa->tb_id != tb->tb_id) ||
1579		    (fa->fa_tos != tos))
1580			break;
1581
1582		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1583		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1584		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1585		    (!cfg->fc_prefsrc ||
1586		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1587		    (!cfg->fc_protocol ||
1588		     fi->fib_protocol == cfg->fc_protocol) &&
1589		    fib_nh_match(cfg, fi, extack) == 0 &&
1590		    fib_metrics_match(cfg, fi)) {
1591			fa_to_delete = fa;
1592			break;
1593		}
1594	}
1595
1596	if (!fa_to_delete)
1597		return -ESRCH;
1598
1599	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
1600				 fa_to_delete, extack);
1601	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
1602		  &cfg->fc_nlinfo, 0);
1603
 
 
 
 
 
1604	if (!plen)
1605		tb->tb_num_default--;
1606
1607	fib_remove_alias(t, tp, l, fa_to_delete);
 
 
 
 
 
 
1608
1609	if (fa_to_delete->fa_state & FA_S_ACCESSED)
1610		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1611
1612	fib_release_info(fa_to_delete->fa_info);
1613	alias_free_mem_rcu(fa_to_delete);
1614	return 0;
1615}
1616
1617/* Scan for the next leaf starting at the provided key value */
1618static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
1619{
1620	struct key_vector *pn, *n = *tn;
1621	unsigned long cindex;
1622
1623	/* this loop is meant to try and find the key in the trie */
1624	do {
1625		/* record parent and next child index */
1626		pn = n;
1627		cindex = (key > pn->key) ? get_index(key, pn) : 0;
1628
1629		if (cindex >> pn->bits)
1630			break;
1631
1632		/* descend into the next child */
1633		n = get_child_rcu(pn, cindex++);
1634		if (!n)
1635			break;
1636
1637		/* guarantee forward progress on the keys */
1638		if (IS_LEAF(n) && (n->key >= key))
1639			goto found;
1640	} while (IS_TNODE(n));
1641
1642	/* this loop will search for the next leaf with a greater key */
1643	while (!IS_TRIE(pn)) {
1644		/* if we exhausted the parent node we will need to climb */
1645		if (cindex >= (1ul << pn->bits)) {
1646			t_key pkey = pn->key;
1647
1648			pn = node_parent_rcu(pn);
1649			cindex = get_index(pkey, pn) + 1;
1650			continue;
 
 
1651		}
1652
1653		/* grab the next available node */
1654		n = get_child_rcu(pn, cindex++);
1655		if (!n)
1656			continue;
1657
1658		/* no need to compare keys since we bumped the index */
1659		if (IS_LEAF(n))
1660			goto found;
1661
1662		/* Rescan start scanning in new node */
1663		pn = n;
1664		cindex = 0;
1665	}
1666
1667	*tn = pn;
1668	return NULL; /* Root of trie */
1669found:
1670	/* if we are at the limit for keys just return NULL for the tnode */
1671	*tn = pn;
1672	return n;
1673}
1674
1675static void fib_trie_free(struct fib_table *tb)
1676{
1677	struct trie *t = (struct trie *)tb->tb_data;
1678	struct key_vector *pn = t->kv;
1679	unsigned long cindex = 1;
1680	struct hlist_node *tmp;
1681	struct fib_alias *fa;
1682
1683	/* walk trie in reverse order and free everything */
1684	for (;;) {
1685		struct key_vector *n;
1686
1687		if (!(cindex--)) {
1688			t_key pkey = pn->key;
1689
1690			if (IS_TRIE(pn))
1691				break;
1692
1693			n = pn;
1694			pn = node_parent(pn);
1695
1696			/* drop emptied tnode */
1697			put_child_root(pn, n->key, NULL);
1698			node_free(n);
1699
1700			cindex = get_index(pkey, pn);
1701
1702			continue;
1703		}
1704
1705		/* grab the next available node */
1706		n = get_child(pn, cindex);
1707		if (!n)
1708			continue;
1709
1710		if (IS_TNODE(n)) {
1711			/* record pn and cindex for leaf walking */
1712			pn = n;
1713			cindex = 1ul << n->bits;
1714
1715			continue;
1716		}
1717
1718		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1719			hlist_del_rcu(&fa->fa_list);
1720			alias_free_mem_rcu(fa);
1721		}
1722
1723		put_child_root(pn, n->key, NULL);
1724		node_free(n);
1725	}
1726
1727#ifdef CONFIG_IP_FIB_TRIE_STATS
1728	free_percpu(t->stats);
1729#endif
1730	kfree(tb);
1731}
1732
1733struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
 
 
 
 
1734{
1735	struct trie *ot = (struct trie *)oldtb->tb_data;
1736	struct key_vector *l, *tp = ot->kv;
1737	struct fib_table *local_tb;
1738	struct fib_alias *fa;
1739	struct trie *lt;
1740	t_key key = 0;
1741
1742	if (oldtb->tb_data == oldtb->__data)
1743		return oldtb;
1744
1745	local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
1746	if (!local_tb)
1747		return NULL;
1748
1749	lt = (struct trie *)local_tb->tb_data;
1750
1751	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
1752		struct key_vector *local_l = NULL, *local_tp;
1753
1754		hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
1755			struct fib_alias *new_fa;
 
 
1756
1757			if (local_tb->tb_id != fa->tb_id)
 
 
1758				continue;
1759
1760			/* clone fa for new local table */
1761			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1762			if (!new_fa)
1763				goto out;
1764
1765			memcpy(new_fa, fa, sizeof(*fa));
1766
1767			/* insert clone into table */
1768			if (!local_l)
1769				local_l = fib_find_node(lt, &local_tp, l->key);
1770
1771			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
1772					     NULL, l->key)) {
1773				kmem_cache_free(fn_alias_kmem, new_fa);
1774				goto out;
1775			}
1776		}
1777
1778		/* stop loop if key wrapped back to 0 */
1779		key = l->key + 1;
1780		if (key < l->key)
1781			break;
1782	}
1783
1784	return local_tb;
1785out:
1786	fib_trie_free(local_tb);
1787
1788	return NULL;
1789}
1790
1791/* Caller must hold RTNL */
1792void fib_table_flush_external(struct fib_table *tb)
1793{
1794	struct trie *t = (struct trie *)tb->tb_data;
1795	struct key_vector *pn = t->kv;
1796	unsigned long cindex = 1;
1797	struct hlist_node *tmp;
1798	struct fib_alias *fa;
1799
1800	/* walk trie in reverse order */
1801	for (;;) {
1802		unsigned char slen = 0;
1803		struct key_vector *n;
1804
1805		if (!(cindex--)) {
1806			t_key pkey = pn->key;
1807
1808			/* cannot resize the trie vector */
1809			if (IS_TRIE(pn))
1810				break;
1811
1812			/* update the suffix to address pulled leaves */
1813			if (pn->slen > pn->pos)
1814				update_suffix(pn);
1815
1816			/* resize completed node */
1817			pn = resize(t, pn);
1818			cindex = get_index(pkey, pn);
1819
1820			continue;
1821		}
1822
1823		/* grab the next available node */
1824		n = get_child(pn, cindex);
1825		if (!n)
1826			continue;
1827
1828		if (IS_TNODE(n)) {
1829			/* record pn and cindex for leaf walking */
1830			pn = n;
1831			cindex = 1ul << n->bits;
1832
1833			continue;
1834		}
1835
1836		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1837			/* if alias was cloned to local then we just
1838			 * need to remove the local copy from main
1839			 */
1840			if (tb->tb_id != fa->tb_id) {
1841				hlist_del_rcu(&fa->fa_list);
1842				alias_free_mem_rcu(fa);
1843				continue;
1844			}
1845
1846			/* record local slen */
1847			slen = fa->fa_slen;
1848		}
1849
1850		/* update leaf slen */
1851		n->slen = slen;
1852
1853		if (hlist_empty(&n->leaf)) {
1854			put_child_root(pn, n->key, NULL);
1855			node_free(n);
1856		}
1857	}
1858}
1859
1860/* Caller must hold RTNL. */
1861int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
1862{
1863	struct trie *t = (struct trie *)tb->tb_data;
1864	struct key_vector *pn = t->kv;
1865	unsigned long cindex = 1;
1866	struct hlist_node *tmp;
1867	struct fib_alias *fa;
1868	int found = 0;
1869
1870	/* walk trie in reverse order */
1871	for (;;) {
1872		unsigned char slen = 0;
1873		struct key_vector *n;
1874
1875		if (!(cindex--)) {
1876			t_key pkey = pn->key;
1877
1878			/* cannot resize the trie vector */
1879			if (IS_TRIE(pn))
1880				break;
1881
1882			/* update the suffix to address pulled leaves */
1883			if (pn->slen > pn->pos)
1884				update_suffix(pn);
1885
1886			/* resize completed node */
1887			pn = resize(t, pn);
1888			cindex = get_index(pkey, pn);
1889
1890			continue;
1891		}
1892
1893		/* grab the next available node */
1894		n = get_child(pn, cindex);
1895		if (!n)
1896			continue;
1897
1898		if (IS_TNODE(n)) {
1899			/* record pn and cindex for leaf walking */
1900			pn = n;
1901			cindex = 1ul << n->bits;
1902
1903			continue;
1904		}
 
1905
1906		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1907			struct fib_info *fi = fa->fa_info;
1908
1909			if (!fi || tb->tb_id != fa->tb_id ||
1910			    (!(fi->fib_flags & RTNH_F_DEAD) &&
1911			     !fib_props[fa->fa_type].error)) {
1912				slen = fa->fa_slen;
1913				continue;
1914			}
1915
1916			/* Do not flush error routes if network namespace is
1917			 * not being dismantled
1918			 */
1919			if (!flush_all && fib_props[fa->fa_type].error) {
1920				slen = fa->fa_slen;
1921				continue;
1922			}
1923
1924			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
1925						 n->key,
1926						 KEYLENGTH - fa->fa_slen, fa,
1927						 NULL);
1928			hlist_del_rcu(&fa->fa_list);
1929			fib_release_info(fa->fa_info);
1930			alias_free_mem_rcu(fa);
1931			found++;
1932		}
1933
1934		/* update leaf slen */
1935		n->slen = slen;
1936
1937		if (hlist_empty(&n->leaf)) {
1938			put_child_root(pn, n->key, NULL);
1939			node_free(n);
1940		}
1941	}
1942
 
 
 
1943	pr_debug("trie_flush found=%d\n", found);
1944	return found;
1945}
1946
1947/* derived from fib_trie_free */
1948static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
1949				     struct nl_info *info)
1950{
1951	struct trie *t = (struct trie *)tb->tb_data;
1952	struct key_vector *pn = t->kv;
1953	unsigned long cindex = 1;
1954	struct fib_alias *fa;
1955
1956	for (;;) {
1957		struct key_vector *n;
1958
1959		if (!(cindex--)) {
1960			t_key pkey = pn->key;
1961
1962			if (IS_TRIE(pn))
1963				break;
1964
1965			pn = node_parent(pn);
1966			cindex = get_index(pkey, pn);
1967			continue;
1968		}
1969
1970		/* grab the next available node */
1971		n = get_child(pn, cindex);
1972		if (!n)
1973			continue;
1974
1975		if (IS_TNODE(n)) {
1976			/* record pn and cindex for leaf walking */
1977			pn = n;
1978			cindex = 1ul << n->bits;
1979
1980			continue;
1981		}
1982
1983		hlist_for_each_entry(fa, &n->leaf, fa_list) {
1984			struct fib_info *fi = fa->fa_info;
1985
1986			if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
1987				continue;
1988
1989			rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
1990				  KEYLENGTH - fa->fa_slen, tb->tb_id,
1991				  info, NLM_F_REPLACE);
1992
1993			/* call_fib_entry_notifiers will be removed when
1994			 * in-kernel notifier is implemented and supported
1995			 * for nexthop objects
1996			 */
1997			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
1998						 n->key,
1999						 KEYLENGTH - fa->fa_slen, fa,
2000						 NULL);
2001		}
2002	}
2003}
2004
2005void fib_info_notify_update(struct net *net, struct nl_info *info)
2006{
2007	unsigned int h;
2008
2009	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2010		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2011		struct fib_table *tb;
2012
2013		hlist_for_each_entry_rcu(tb, head, tb_hlist)
2014			__fib_info_notify_update(net, tb, info);
2015	}
2016}
2017
2018static void fib_leaf_notify(struct net *net, struct key_vector *l,
2019			    struct fib_table *tb, struct notifier_block *nb)
 
2020{
 
2021	struct fib_alias *fa;
 
2022
2023	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2024		struct fib_info *fi = fa->fa_info;
2025
2026		if (!fi)
2027			continue;
2028
2029		/* local and main table can share the same trie,
2030		 * so don't notify twice for the same entry.
2031		 */
2032		if (tb->tb_id != fa->tb_id)
2033			continue;
 
2034
2035		call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key,
2036					KEYLENGTH - fa->fa_slen, fa);
2037	}
2038}
2039
2040static void fib_table_notify(struct net *net, struct fib_table *tb,
2041			     struct notifier_block *nb)
2042{
2043	struct trie *t = (struct trie *)tb->tb_data;
2044	struct key_vector *l, *tp = t->kv;
2045	t_key key = 0;
2046
2047	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
2048		fib_leaf_notify(net, l, tb, nb);
2049
2050		key = l->key + 1;
2051		/* stop in case of wrap around */
2052		if (key < l->key)
2053			break;
2054	}
2055}
2056
2057void fib_notify(struct net *net, struct notifier_block *nb)
2058{
2059	unsigned int h;
2060
2061	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2062		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2063		struct fib_table *tb;
2064
2065		hlist_for_each_entry_rcu(tb, head, tb_hlist)
2066			fib_table_notify(net, tb, nb);
2067	}
 
 
2068}
2069
2070static void __trie_free_rcu(struct rcu_head *head)
2071{
2072	struct fib_table *tb = container_of(head, struct fib_table, rcu);
2073#ifdef CONFIG_IP_FIB_TRIE_STATS
2074	struct trie *t = (struct trie *)tb->tb_data;
2075
2076	if (tb->tb_data == tb->__data)
2077		free_percpu(t->stats);
2078#endif /* CONFIG_IP_FIB_TRIE_STATS */
2079	kfree(tb);
2080}
2081
2082void fib_free_table(struct fib_table *tb)
2083{
2084	call_rcu(&tb->rcu, __trie_free_rcu);
2085}
2086
2087static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
2088			     struct sk_buff *skb, struct netlink_callback *cb,
2089			     struct fib_dump_filter *filter)
2090{
2091	unsigned int flags = NLM_F_MULTI;
2092	__be32 xkey = htonl(l->key);
2093	int i, s_i, i_fa, s_fa, err;
2094	struct fib_alias *fa;
2095
2096	if (filter->filter_set ||
2097	    !filter->dump_exceptions || !filter->dump_routes)
2098		flags |= NLM_F_DUMP_FILTERED;
2099
2100	s_i = cb->args[4];
2101	s_fa = cb->args[5];
2102	i = 0;
2103
2104	/* rcu_read_lock is hold by caller */
2105	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2106		struct fib_info *fi = fa->fa_info;
2107
2108		if (i < s_i)
2109			goto next;
2110
2111		i_fa = 0;
2112
2113		if (tb->tb_id != fa->tb_id)
2114			goto next;
2115
2116		if (filter->filter_set) {
2117			if (filter->rt_type && fa->fa_type != filter->rt_type)
2118				goto next;
2119
2120			if ((filter->protocol &&
2121			     fi->fib_protocol != filter->protocol))
2122				goto next;
2123
2124			if (filter->dev &&
2125			    !fib_info_nh_uses_dev(fi, filter->dev))
2126				goto next;
2127		}
2128
2129		if (filter->dump_routes) {
2130			if (!s_fa) {
2131				err = fib_dump_info(skb,
2132						    NETLINK_CB(cb->skb).portid,
2133						    cb->nlh->nlmsg_seq,
2134						    RTM_NEWROUTE,
2135						    tb->tb_id, fa->fa_type,
2136						    xkey,
2137						    KEYLENGTH - fa->fa_slen,
2138						    fa->fa_tos, fi, flags);
2139				if (err < 0)
2140					goto stop;
2141			}
2142
2143			i_fa++;
2144		}
2145
2146		if (filter->dump_exceptions) {
2147			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
2148						 &i_fa, s_fa, flags);
2149			if (err < 0)
2150				goto stop;
2151		}
2152
2153next:
2154		i++;
2155	}
2156
2157	cb->args[4] = i;
2158	return skb->len;
2159
2160stop:
2161	cb->args[4] = i;
2162	cb->args[5] = i_fa;
2163	return err;
2164}
2165
2166/* rcu_read_lock needs to be hold by caller from readside */
2167int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
2168		   struct netlink_callback *cb, struct fib_dump_filter *filter)
2169{
2170	struct trie *t = (struct trie *)tb->tb_data;
2171	struct key_vector *l, *tp = t->kv;
 
 
 
 
2172	/* Dump starting at last key.
2173	 * Note: 0.0.0.0/0 (ie default) is first key.
2174	 */
2175	int count = cb->args[2];
2176	t_key key = cb->args[3];
2177
2178	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
2179		int err;
 
 
 
 
 
2180
2181		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
2182		if (err < 0) {
2183			cb->args[3] = key;
2184			cb->args[2] = count;
2185			return err;
 
2186		}
2187
2188		++count;
2189		key = l->key + 1;
2190
2191		memset(&cb->args[4], 0,
2192		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
2193
2194		/* stop loop if key wrapped back to 0 */
2195		if (key < l->key)
2196			break;
2197	}
2198
2199	cb->args[3] = key;
2200	cb->args[2] = count;
2201
2202	return skb->len;
2203}
2204
2205void __init fib_trie_init(void)
2206{
2207	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2208					  sizeof(struct fib_alias),
2209					  0, SLAB_PANIC, NULL);
2210
2211	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2212					   LEAF_SIZE,
 
2213					   0, SLAB_PANIC, NULL);
2214}
2215
2216struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
 
2217{
2218	struct fib_table *tb;
2219	struct trie *t;
2220	size_t sz = sizeof(*tb);
2221
2222	if (!alias)
2223		sz += sizeof(struct trie);
2224
2225	tb = kzalloc(sz, GFP_KERNEL);
2226	if (!tb)
 
2227		return NULL;
2228
2229	tb->tb_id = id;
 
2230	tb->tb_num_default = 0;
2231	tb->tb_data = (alias ? alias->__data : tb->__data);
2232
2233	if (alias)
2234		return tb;
2235
2236	t = (struct trie *) tb->tb_data;
2237	t->kv[0].pos = KEYLENGTH;
2238	t->kv[0].slen = KEYLENGTH;
2239#ifdef CONFIG_IP_FIB_TRIE_STATS
2240	t->stats = alloc_percpu(struct trie_use_stats);
2241	if (!t->stats) {
2242		kfree(tb);
2243		tb = NULL;
2244	}
2245#endif
2246
2247	return tb;
2248}
2249
2250#ifdef CONFIG_PROC_FS
2251/* Depth first Trie walk iterator */
2252struct fib_trie_iter {
2253	struct seq_net_private p;
2254	struct fib_table *tb;
2255	struct key_vector *tnode;
2256	unsigned int index;
2257	unsigned int depth;
2258};
2259
2260static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
2261{
2262	unsigned long cindex = iter->index;
2263	struct key_vector *pn = iter->tnode;
2264	t_key pkey;
 
 
 
 
2265
2266	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2267		 iter->tnode, iter->index, iter->depth);
 
 
 
2268
2269	while (!IS_TRIE(pn)) {
2270		while (cindex < child_length(pn)) {
2271			struct key_vector *n = get_child_rcu(pn, cindex++);
2272
2273			if (!n)
2274				continue;
2275
2276			if (IS_LEAF(n)) {
2277				iter->tnode = pn;
2278				iter->index = cindex;
2279			} else {
2280				/* push down one level */
2281				iter->tnode = n;
2282				iter->index = 0;
2283				++iter->depth;
2284			}
2285
2286			return n;
2287		}
2288
2289		/* Current node exhausted, pop back up */
2290		pkey = pn->key;
2291		pn = node_parent_rcu(pn);
2292		cindex = get_index(pkey, pn) + 1;
2293		--iter->depth;
2294	}
2295
2296	/* record root node so further searches know we are done */
2297	iter->tnode = pn;
2298	iter->index = 0;
 
 
 
 
 
2299
 
2300	return NULL;
2301}
2302
2303static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
2304					     struct trie *t)
2305{
2306	struct key_vector *n, *pn;
2307
2308	if (!t)
2309		return NULL;
2310
2311	pn = t->kv;
2312	n = rcu_dereference(pn->tnode[0]);
2313	if (!n)
2314		return NULL;
2315
2316	if (IS_TNODE(n)) {
2317		iter->tnode = n;
2318		iter->index = 0;
2319		iter->depth = 1;
2320	} else {
2321		iter->tnode = pn;
2322		iter->index = 0;
2323		iter->depth = 0;
2324	}
2325
2326	return n;
2327}
2328
2329static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2330{
2331	struct key_vector *n;
2332	struct fib_trie_iter iter;
2333
2334	memset(s, 0, sizeof(*s));
2335
2336	rcu_read_lock();
2337	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2338		if (IS_LEAF(n)) {
2339			struct fib_alias *fa;
 
 
2340
2341			s->leaves++;
2342			s->totdepth += iter.depth;
2343			if (iter.depth > s->maxdepth)
2344				s->maxdepth = iter.depth;
2345
2346			hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
2347				++s->prefixes;
2348		} else {
 
 
 
2349			s->tnodes++;
2350			if (n->bits < MAX_STAT_DEPTH)
2351				s->nodesizes[n->bits]++;
2352			s->nullpointers += tn_info(n)->empty_children;
 
 
 
2353		}
2354	}
2355	rcu_read_unlock();
2356}
2357
2358/*
2359 *	This outputs /proc/net/fib_triestats
2360 */
2361static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2362{
2363	unsigned int i, max, pointers, bytes, avdepth;
2364
2365	if (stat->leaves)
2366		avdepth = stat->totdepth*100 / stat->leaves;
2367	else
2368		avdepth = 0;
2369
2370	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2371		   avdepth / 100, avdepth % 100);
2372	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2373
2374	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2375	bytes = LEAF_SIZE * stat->leaves;
2376
2377	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2378	bytes += sizeof(struct fib_alias) * stat->prefixes;
2379
2380	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2381	bytes += TNODE_SIZE(0) * stat->tnodes;
2382
2383	max = MAX_STAT_DEPTH;
2384	while (max > 0 && stat->nodesizes[max-1] == 0)
2385		max--;
2386
2387	pointers = 0;
2388	for (i = 1; i < max; i++)
2389		if (stat->nodesizes[i] != 0) {
2390			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2391			pointers += (1<<i) * stat->nodesizes[i];
2392		}
2393	seq_putc(seq, '\n');
2394	seq_printf(seq, "\tPointers: %u\n", pointers);
2395
2396	bytes += sizeof(struct key_vector *) * pointers;
2397	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2398	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2399}
2400
#ifdef CONFIG_IP_FIB_TRIE_STATS
/*
 * Sum the per-cpu usage counters in @stats over all possible CPUs and
 * print the totals to @seq.
 */
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats __percpu *stats)
{
	struct trie_use_stats total = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *c = per_cpu_ptr(stats, cpu);

		total.gets += c->gets;
		total.backtrack += c->backtrack;
		total.semantic_match_passed += c->semantic_match_passed;
		total.semantic_match_miss += c->semantic_match_miss;
		total.null_node_hit += c->null_node_hit;
		total.resize_node_skipped += c->resize_node_skipped;
	}

	seq_puts(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", total.gets);
	seq_printf(seq, "backtracks = %u\n", total.backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   total.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   total.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", total.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   total.resize_node_skipped);
}
#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2430
2431static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2432{
2433	if (tb->tb_id == RT_TABLE_LOCAL)
2434		seq_puts(seq, "Local:\n");
2435	else if (tb->tb_id == RT_TABLE_MAIN)
2436		seq_puts(seq, "Main:\n");
2437	else
2438		seq_printf(seq, "Id %d:\n", tb->tb_id);
2439}
2440
2441
2442static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2443{
2444	struct net *net = (struct net *)seq->private;
2445	unsigned int h;
2446
2447	seq_printf(seq,
2448		   "Basic info: size of leaf:"
2449		   " %zd bytes, size of tnode: %zd bytes.\n",
2450		   LEAF_SIZE, TNODE_SIZE(0));
2451
2452	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2453		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 
2454		struct fib_table *tb;
2455
2456		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2457			struct trie *t = (struct trie *) tb->tb_data;
2458			struct trie_stat stat;
2459
2460			if (!t)
2461				continue;
2462
2463			fib_table_print(seq, tb);
2464
2465			trie_collect_stats(t, &stat);
2466			trie_show_stats(seq, &stat);
2467#ifdef CONFIG_IP_FIB_TRIE_STATS
2468			trie_show_usage(seq, t->stats);
2469#endif
2470		}
2471	}
2472
2473	return 0;
2474}
2475
2476static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
 
 
 
 
 
 
 
 
 
 
 
 
 
2477{
2478	struct fib_trie_iter *iter = seq->private;
2479	struct net *net = seq_file_net(seq);
2480	loff_t idx = 0;
2481	unsigned int h;
2482
2483	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2484		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 
2485		struct fib_table *tb;
2486
2487		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2488			struct key_vector *n;
2489
2490			for (n = fib_trie_get_first(iter,
2491						    (struct trie *) tb->tb_data);
2492			     n; n = fib_trie_get_next(iter))
2493				if (pos == idx++) {
2494					iter->tb = tb;
2495					return n;
2496				}
2497		}
2498	}
2499
2500	return NULL;
2501}
2502
2503static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2504	__acquires(RCU)
2505{
2506	rcu_read_lock();
2507	return fib_trie_get_idx(seq, *pos);
2508}
2509
2510static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2511{
2512	struct fib_trie_iter *iter = seq->private;
2513	struct net *net = seq_file_net(seq);
2514	struct fib_table *tb = iter->tb;
2515	struct hlist_node *tb_node;
2516	unsigned int h;
2517	struct key_vector *n;
2518
2519	++*pos;
2520	/* next node in same table */
2521	n = fib_trie_get_next(iter);
2522	if (n)
2523		return n;
2524
2525	/* walk rest of this hash chain */
2526	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2527	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2528		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2529		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2530		if (n)
2531			goto found;
2532	}
2533
2534	/* new hash chain */
2535	while (++h < FIB_TABLE_HASHSZ) {
2536		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2537		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2538			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2539			if (n)
2540				goto found;
2541		}
2542	}
2543	return NULL;
2544
2545found:
2546	iter->tb = tb;
2547	return n;
2548}
2549
2550static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2551	__releases(RCU)
2552{
2553	rcu_read_unlock();
2554}
2555
/* Emit @n indentation steps of three spaces each; nothing if @n <= 0. */
static void seq_indent(struct seq_file *seq, int n)
{
	for (; n > 0; n--)
		seq_puts(seq, "   ");
}
2561
2562static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2563{
2564	switch (s) {
2565	case RT_SCOPE_UNIVERSE: return "universe";
2566	case RT_SCOPE_SITE:	return "site";
2567	case RT_SCOPE_LINK:	return "link";
2568	case RT_SCOPE_HOST:	return "host";
2569	case RT_SCOPE_NOWHERE:	return "nowhere";
2570	default:
2571		snprintf(buf, len, "scope=%d", s);
2572		return buf;
2573	}
2574}
2575
2576static const char *const rtn_type_names[__RTN_MAX] = {
2577	[RTN_UNSPEC] = "UNSPEC",
2578	[RTN_UNICAST] = "UNICAST",
2579	[RTN_LOCAL] = "LOCAL",
2580	[RTN_BROADCAST] = "BROADCAST",
2581	[RTN_ANYCAST] = "ANYCAST",
2582	[RTN_MULTICAST] = "MULTICAST",
2583	[RTN_BLACKHOLE] = "BLACKHOLE",
2584	[RTN_UNREACHABLE] = "UNREACHABLE",
2585	[RTN_PROHIBIT] = "PROHIBIT",
2586	[RTN_THROW] = "THROW",
2587	[RTN_NAT] = "NAT",
2588	[RTN_XRESOLVE] = "XRESOLVE",
2589};
2590
2591static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2592{
2593	if (t < __RTN_MAX && rtn_type_names[t])
2594		return rtn_type_names[t];
2595	snprintf(buf, len, "type %u", t);
2596	return buf;
2597}
2598
2599/* Pretty print the trie */
2600static int fib_trie_seq_show(struct seq_file *seq, void *v)
2601{
2602	const struct fib_trie_iter *iter = seq->private;
2603	struct key_vector *n = v;
2604
2605	if (IS_TRIE(node_parent_rcu(n)))
2606		fib_table_print(seq, iter->tb);
2607
2608	if (IS_TNODE(n)) {
2609		__be32 prf = htonl(n->key);
 
2610
2611		seq_indent(seq, iter->depth-1);
2612		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
2613			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
2614			   tn_info(n)->full_children,
2615			   tn_info(n)->empty_children);
2616	} else {
2617		__be32 val = htonl(n->key);
2618		struct fib_alias *fa;
 
 
2619
2620		seq_indent(seq, iter->depth);
2621		seq_printf(seq, "  |-- %pI4\n", &val);
2622
2623		hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
2624			char buf1[32], buf2[32];
 
 
 
2625
2626			seq_indent(seq, iter->depth + 1);
2627			seq_printf(seq, "  /%zu %s %s",
2628				   KEYLENGTH - fa->fa_slen,
2629				   rtn_scope(buf1, sizeof(buf1),
2630					     fa->fa_info->fib_scope),
2631				   rtn_type(buf2, sizeof(buf2),
2632					    fa->fa_type));
2633			if (fa->fa_tos)
2634				seq_printf(seq, " tos=%d", fa->fa_tos);
2635			seq_putc(seq, '\n');
2636		}
2637	}
2638
2639	return 0;
2640}
2641
2642static const struct seq_operations fib_trie_seq_ops = {
2643	.start  = fib_trie_seq_start,
2644	.next   = fib_trie_seq_next,
2645	.stop   = fib_trie_seq_stop,
2646	.show   = fib_trie_seq_show,
2647};
2648
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2649struct fib_route_iter {
2650	struct seq_net_private p;
2651	struct fib_table *main_tb;
2652	struct key_vector *tnode;
2653	loff_t	pos;
2654	t_key	key;
2655};
2656
2657static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
2658					    loff_t pos)
2659{
2660	struct key_vector *l, **tp = &iter->tnode;
2661	t_key key;
2662
2663	/* use cached location of previously found key */
2664	if (iter->pos > 0 && pos >= iter->pos) {
2665		key = iter->key;
2666	} else {
2667		iter->pos = 1;
2668		key = 0;
2669	}
2670
2671	pos -= iter->pos;
2672
2673	while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
2674		key = l->key + 1;
2675		iter->pos++;
2676		l = NULL;
2677
2678		/* handle unlikely case of a key wrap */
2679		if (!key)
2680			break;
2681	}
2682
2683	if (l)
2684		iter->key = l->key;	/* remember it */
2685	else
2686		iter->pos = 0;		/* forget it */
2687
2688	return l;
2689}
2690
2691static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2692	__acquires(RCU)
2693{
2694	struct fib_route_iter *iter = seq->private;
2695	struct fib_table *tb;
2696	struct trie *t;
2697
2698	rcu_read_lock();
2699
2700	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2701	if (!tb)
2702		return NULL;
2703
2704	iter->main_tb = tb;
2705	t = (struct trie *)tb->tb_data;
2706	iter->tnode = t->kv;
2707
2708	if (*pos != 0)
2709		return fib_route_get_idx(iter, *pos);
2710
2711	iter->pos = 0;
2712	iter->key = KEY_MAX;
2713
2714	return SEQ_START_TOKEN;
2715}
2716
2717static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2718{
2719	struct fib_route_iter *iter = seq->private;
2720	struct key_vector *l = NULL;
2721	t_key key = iter->key + 1;
2722
2723	++*pos;
2724
2725	/* only allow key of 0 for start of sequence */
2726	if ((v == SEQ_START_TOKEN) || key)
2727		l = leaf_walk_rcu(&iter->tnode, key);
2728
2729	if (l) {
2730		iter->key = l->key;
2731		iter->pos++;
2732	} else {
2733		iter->pos = 0;
 
 
 
 
2734	}
2735
 
 
 
 
2736	return l;
2737}
2738
2739static void fib_route_seq_stop(struct seq_file *seq, void *v)
2740	__releases(RCU)
2741{
2742	rcu_read_unlock();
2743}
2744
2745static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
2746{
2747	unsigned int flags = 0;
2748
2749	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2750		flags = RTF_REJECT;
2751	if (fi) {
2752		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
2753
2754		if (nhc->nhc_gw.ipv4)
2755			flags |= RTF_GATEWAY;
2756	}
2757	if (mask == htonl(0xFFFFFFFF))
2758		flags |= RTF_HOST;
2759	flags |= RTF_UP;
2760	return flags;
2761}
2762
2763/*
2764 *	This outputs /proc/net/route.
2765 *	The format of the file is not supposed to be changed
2766 *	and needs to be same as fib_hash output to avoid breaking
2767 *	legacy utilities
2768 */
2769static int fib_route_seq_show(struct seq_file *seq, void *v)
2770{
2771	struct fib_route_iter *iter = seq->private;
2772	struct fib_table *tb = iter->main_tb;
2773	struct fib_alias *fa;
2774	struct key_vector *l = v;
2775	__be32 prefix;
2776
2777	if (v == SEQ_START_TOKEN) {
2778		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2779			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2780			   "\tWindow\tIRTT");
2781		return 0;
2782	}
2783
2784	prefix = htonl(l->key);
 
 
2785
2786	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2787		struct fib_info *fi = fa->fa_info;
2788		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
2789		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2790
2791		if ((fa->fa_type == RTN_BROADCAST) ||
2792		    (fa->fa_type == RTN_MULTICAST))
2793			continue;
 
2794
2795		if (fa->tb_id != tb->tb_id)
2796			continue;
 
2797
2798		seq_setwidth(seq, 127);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2799
2800		if (fi) {
2801			struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
2802			__be32 gw = 0;
2803
2804			if (nhc->nhc_gw_family == AF_INET)
2805				gw = nhc->nhc_gw.ipv4;
2806
2807			seq_printf(seq,
2808				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2809				   "%d\t%08X\t%d\t%u\t%u",
2810				   nhc->nhc_dev ? nhc->nhc_dev->name : "*",
2811				   prefix, gw, flags, 0, 0,
2812				   fi->fib_priority,
2813				   mask,
2814				   (fi->fib_advmss ?
2815				    fi->fib_advmss + 40 : 0),
2816				   fi->fib_window,
2817				   fi->fib_rtt >> 3);
2818		} else {
2819			seq_printf(seq,
2820				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2821				   "%d\t%08X\t%d\t%u\t%u",
2822				   prefix, 0, flags, 0, 0, 0,
2823				   mask, 0, 0, 0);
2824		}
2825		seq_pad(seq, '\n');
2826	}
2827
2828	return 0;
2829}
2830
2831static const struct seq_operations fib_route_seq_ops = {
2832	.start  = fib_route_seq_start,
2833	.next   = fib_route_seq_next,
2834	.stop   = fib_route_seq_stop,
2835	.show   = fib_route_seq_show,
2836};
2837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2838int __net_init fib_proc_init(struct net *net)
2839{
2840	if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
2841			sizeof(struct fib_trie_iter)))
2842		goto out1;
2843
2844	if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
2845			fib_triestat_seq_show, NULL))
2846		goto out2;
2847
2848	if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops,
2849			sizeof(struct fib_route_iter)))
2850		goto out3;
2851
2852	return 0;
2853
2854out3:
2855	remove_proc_entry("fib_triestat", net->proc_net);
2856out2:
2857	remove_proc_entry("fib_trie", net->proc_net);
2858out1:
2859	return -ENOMEM;
2860}
2861
2862void __net_exit fib_proc_exit(struct net *net)
2863{
2864	remove_proc_entry("fib_trie", net->proc_net);
2865	remove_proc_entry("fib_triestat", net->proc_net);
2866	remove_proc_entry("route", net->proc_net);
2867}
2868
2869#endif /* CONFIG_PROC_FS */
v3.1
 
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally described in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET		An implementation of the TCP/IP protocol suite for the LINUX
  30 *		operating system.  INET is implemented using the  BSD Socket
  31 *		interface as the means of communication with the user level.
  32 *
  33 *		IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *		This program is free software; you can redistribute it and/or
  39 *		modify it under the terms of the GNU General Public License
  40 *		as published by the Free Software Foundation; either version
  41 *		2 of the License, or (at your option) any later version.
  42 *
 * Substantial contributions to this work come from:
  44 *
  45 *		David S. Miller, <davem@davemloft.net>
  46 *		Stephen Hemminger <shemminger@osdl.org>
  47 *		Paul E. McKenney <paulmck@us.ibm.com>
  48 *		Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <asm/system.h>
  55#include <linux/bitops.h>
  56#include <linux/types.h>
  57#include <linux/kernel.h>
  58#include <linux/mm.h>
  59#include <linux/string.h>
  60#include <linux/socket.h>
  61#include <linux/sockios.h>
  62#include <linux/errno.h>
  63#include <linux/in.h>
  64#include <linux/inet.h>
  65#include <linux/inetdevice.h>
  66#include <linux/netdevice.h>
  67#include <linux/if_arp.h>
  68#include <linux/proc_fs.h>
  69#include <linux/rcupdate.h>
  70#include <linux/skbuff.h>
  71#include <linux/netlink.h>
  72#include <linux/init.h>
  73#include <linux/list.h>
  74#include <linux/slab.h>
  75#include <linux/prefetch.h>
 
 
  76#include <net/net_namespace.h>
  77#include <net/ip.h>
  78#include <net/protocol.h>
  79#include <net/route.h>
  80#include <net/tcp.h>
  81#include <net/sock.h>
  82#include <net/ip_fib.h>
 
 
  83#include "fib_lookup.h"
  84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  85#define MAX_STAT_DEPTH 32
  86
  87#define KEYLENGTH (8*sizeof(t_key))
 
  88
  89typedef unsigned int t_key;
  90
  91#define T_TNODE 0
  92#define T_LEAF  1
  93#define NODE_TYPE_MASK	0x1UL
  94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  95
  96#define IS_TNODE(n) (!(n->parent & T_LEAF))
  97#define IS_LEAF(n) (n->parent & T_LEAF)
  98
  99struct rt_trie_node {
 100	unsigned long parent;
 101	t_key key;
 
 
 
 
 
 
 
 
 
 102};
 103
 104struct leaf {
 105	unsigned long parent;
 106	t_key key;
 107	struct hlist_head list;
 108	struct rcu_head rcu;
 
 
 
 
 
 109};
 110
 111struct leaf_info {
 112	struct hlist_node hlist;
 113	int plen;
 114	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
 115	struct list_head falh;
 116	struct rcu_head rcu;
 117};
 118
 119struct tnode {
 120	unsigned long parent;
 121	t_key key;
 122	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
 123	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 124	unsigned int full_children;	/* KEYLENGTH bits needed */
 125	unsigned int empty_children;	/* KEYLENGTH bits needed */
 126	union {
 127		struct rcu_head rcu;
 128		struct work_struct work;
 129		struct tnode *tnode_free;
 130	};
 131	struct rt_trie_node __rcu *child[0];
 132};
 133
 134#ifdef CONFIG_IP_FIB_TRIE_STATS
 135struct trie_use_stats {
 136	unsigned int gets;
 137	unsigned int backtrack;
 138	unsigned int semantic_match_passed;
 139	unsigned int semantic_match_miss;
 140	unsigned int null_node_hit;
 141	unsigned int resize_node_skipped;
 142};
 143#endif
 144
 145struct trie_stat {
 146	unsigned int totdepth;
 147	unsigned int maxdepth;
 148	unsigned int tnodes;
 149	unsigned int leaves;
 150	unsigned int nullpointers;
 151	unsigned int prefixes;
 152	unsigned int nodesizes[MAX_STAT_DEPTH];
 153};
 154
 155struct trie {
 156	struct rt_trie_node __rcu *trie;
 157#ifdef CONFIG_IP_FIB_TRIE_STATS
 158	struct trie_use_stats stats;
 159#endif
 160};
 161
 162static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
 163static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 164				  int wasfull);
 165static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
 166static struct tnode *inflate(struct trie *t, struct tnode *tn);
 167static struct tnode *halve(struct trie *t, struct tnode *tn);
 168/* tnodes to free after resize(); protected by RTNL */
 169static struct tnode *tnode_free_head;
 170static size_t tnode_free_size;
 171
 172/*
 173 * synchronize_rcu after call_rcu for that many pages; it should be especially
 174 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 175 * obtained experimentally, aiming to avoid visible slowdown.
 176 */
 177static const int sync_pages = 128;
 
 
 178
 179static struct kmem_cache *fn_alias_kmem __read_mostly;
 180static struct kmem_cache *trie_leaf_kmem __read_mostly;
 181
 182/*
 183 * caller must hold RTNL
 184 */
 185static inline struct tnode *node_parent(const struct rt_trie_node *node)
 186{
 187	unsigned long parent;
 188
 189	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
 190
 191	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 192}
 193
 194/*
 195 * caller must hold RCU read lock or RTNL
 196 */
 197static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
 198{
 199	unsigned long parent;
 200
 201	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
 202							   lockdep_rtnl_is_held());
 203
 204	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 205}
 
 206
 207/* Same as rcu_assign_pointer
 208 * but that macro() assumes that value is a pointer.
 209 */
 210static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
 211{
 212	smp_wmb();
 213	node->parent = (unsigned long)ptr | NODE_TYPE(node);
 214}
 215
 216/*
 217 * caller must hold RTNL
 218 */
 219static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
 220{
 221	BUG_ON(i >= 1U << tn->bits);
 222
 223	return rtnl_dereference(tn->child[i]);
 224}
 225
 226/*
 227 * caller must hold RCU read lock or RTNL
 228 */
 229static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
 230{
 231	BUG_ON(i >= 1U << tn->bits);
 232
 233	return rcu_dereference_rtnl(tn->child[i]);
 234}
 235
 236static inline int tnode_child_length(const struct tnode *tn)
 237{
 238	return 1 << tn->bits;
 239}
 240
 241static inline t_key mask_pfx(t_key k, unsigned int l)
 242{
 243	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 244}
 245
 246static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
 247{
 248	if (offset < KEYLENGTH)
 249		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 250	else
 251		return 0;
 252}
 253
 254static inline int tkey_equals(t_key a, t_key b)
 255{
 256	return a == b;
 257}
 258
 259static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 260{
 261	if (bits == 0 || offset >= KEYLENGTH)
 262		return 1;
 263	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 264	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 265}
 266
 267static inline int tkey_mismatch(t_key a, int offset, t_key b)
 268{
 269	t_key diff = a ^ b;
 270	int i = offset;
 271
 272	if (!diff)
 273		return 0;
 274	while ((diff << i) >> (KEYLENGTH-1) == 0)
 275		i++;
 276	return i;
 277}
 278
 279/*
 280  To understand this stuff, an understanding of keys and all their bits is
 281  necessary. Every node in the trie has a key associated with it, but not
 282  all of the bits in that key are significant.
 283
 284  Consider a node 'n' and its parent 'tp'.
 285
 286  If n is a leaf, every bit in its key is significant. Its presence is
 287  necessitated by path compression, since during a tree traversal (when
 288  searching for a leaf - unless we are doing an insertion) we will completely
 289  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 290  a potentially successful search, that we have indeed been walking the
 291  correct key path.
 292
 293  Note that we can never "miss" the correct key in the tree if present by
 294  following the wrong path. Path compression ensures that segments of the key
 295  that are the same for all keys with a given prefix are skipped, but the
 296  skipped part *is* identical for each node in the subtrie below the skipped
 297  bit! trie_insert() in this implementation takes care of that - note the
 298  call to tkey_sub_equals() in trie_insert().
 299
 300  if n is an internal node - a 'tnode' here, the various parts of its key
 301  have many different meanings.
 302
 303  Example:
 304  _________________________________________________________________
 305  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 306  -----------------------------------------------------------------
 307    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 308
 309  _________________________________________________________________
 310  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 311  -----------------------------------------------------------------
 312   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 313
 314  tp->pos = 7
 315  tp->bits = 3
 316  n->pos = 15
 317  n->bits = 4
 318
 319  First, let's just ignore the bits that come before the parent tp, that is
 320  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 321  not use them for anything.
 322
 323  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 324  index into the parent's child array. That is, they will be used to find
 325  'n' among tp's children.
 326
 327  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 328  for the node n.
 329
 330  All the bits we have seen so far are significant to the node n. The rest
 331  of the bits are really not needed or indeed known in n->key.
 332
 333  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 334  n's child array, and will of course be different for each child.
 335
 336
 337  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 338  at this point.
 339
 340*/
 341
 342static inline void check_tnode(const struct tnode *tn)
 343{
 344	WARN_ON(tn && tn->pos+tn->bits > 32);
 345}
 346
 347static const int halve_threshold = 25;
 348static const int inflate_threshold = 50;
 349static const int halve_threshold_root = 15;
 350static const int inflate_threshold_root = 30;
 351
 352static void __alias_free_mem(struct rcu_head *head)
 353{
 354	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 355	kmem_cache_free(fn_alias_kmem, fa);
 356}
 357
 358static inline void alias_free_mem_rcu(struct fib_alias *fa)
 359{
 360	call_rcu(&fa->rcu, __alias_free_mem);
 361}
 362
 363static void __leaf_free_rcu(struct rcu_head *head)
 
 
 
 
 
 364{
 365	struct leaf *l = container_of(head, struct leaf, rcu);
 366	kmem_cache_free(trie_leaf_kmem, l);
 
 
 
 
 367}
 368
 369static inline void free_leaf(struct leaf *l)
 
 
 370{
 371	call_rcu_bh(&l->rcu, __leaf_free_rcu);
 372}
 
 
 
 373
 374static inline void free_leaf_info(struct leaf_info *leaf)
 375{
 376	kfree_rcu(leaf, rcu);
 377}
 378
 379static struct tnode *tnode_alloc(size_t size)
 380{
 381	if (size <= PAGE_SIZE)
 382		return kzalloc(size, GFP_KERNEL);
 383	else
 384		return vzalloc(size);
 385}
 386
 387static void __tnode_vfree(struct work_struct *arg)
 388{
 389	struct tnode *tn = container_of(arg, struct tnode, work);
 390	vfree(tn);
 391}
 392
 393static void __tnode_free_rcu(struct rcu_head *head)
 394{
 395	struct tnode *tn = container_of(head, struct tnode, rcu);
 396	size_t size = sizeof(struct tnode) +
 397		      (sizeof(struct rt_trie_node *) << tn->bits);
 398
 399	if (size <= PAGE_SIZE)
 400		kfree(tn);
 401	else {
 402		INIT_WORK(&tn->work, __tnode_vfree);
 403		schedule_work(&tn->work);
 404	}
 405}
 406
 407static inline void tnode_free(struct tnode *tn)
 408{
 409	if (IS_LEAF(tn))
 410		free_leaf((struct leaf *) tn);
 411	else
 412		call_rcu(&tn->rcu, __tnode_free_rcu);
 413}
 414
 415static void tnode_free_safe(struct tnode *tn)
 416{
 417	BUG_ON(IS_LEAF(tn));
 418	tn->tnode_free = tnode_free_head;
 419	tnode_free_head = tn;
 420	tnode_free_size += sizeof(struct tnode) +
 421			   (sizeof(struct rt_trie_node *) << tn->bits);
 422}
 423
 424static void tnode_free_flush(void)
 425{
 426	struct tnode *tn;
 
 427
 428	while ((tn = tnode_free_head)) {
 429		tnode_free_head = tn->tnode_free;
 430		tn->tnode_free = NULL;
 431		tnode_free(tn);
 432	}
 433
 434	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 435		tnode_free_size = 0;
 436		synchronize_rcu();
 437	}
 438}
 
 
 
 
 
 439
 440static struct leaf *leaf_new(void)
 441{
 442	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 443	if (l) {
 444		l->parent = T_LEAF;
 445		INIT_HLIST_HEAD(&l->list);
 446	}
 447	return l;
 448}
 449
 450static struct leaf_info *leaf_info_new(int plen)
 451{
 452	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 453	if (li) {
 454		li->plen = plen;
 455		li->mask_plen = ntohl(inet_make_mask(plen));
 456		INIT_LIST_HEAD(&li->falh);
 457	}
 458	return li;
 459}
 
 
 
 
 
 460
 461static struct tnode *tnode_new(t_key key, int pos, int bits)
 462{
 463	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
 464	struct tnode *tn = tnode_alloc(sz);
 465
 466	if (tn) {
 467		tn->parent = T_TNODE;
 468		tn->pos = pos;
 469		tn->bits = bits;
 470		tn->key = key;
 471		tn->full_children = 0;
 472		tn->empty_children = 1<<bits;
 473	}
 474
 475	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
 476		 sizeof(struct rt_trie_node) << bits);
 477	return tn;
 478}
 479
 480/*
 481 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 482 * and no bits are skipped. See discussion in dyntree paper p. 6
 483 */
 484
 485static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
 486{
 487	if (n == NULL || IS_LEAF(n))
 488		return 0;
 489
 490	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 491}
 492
 493static inline void put_child(struct trie *t, struct tnode *tn, int i,
 494			     struct rt_trie_node *n)
 
 
 
 495{
 496	tnode_put_child_reorg(tn, i, n, -1);
 497}
 498
 499 /*
 500  * Add a child at position i overwriting the old value.
 501  * Update the value of full_children and empty_children.
 502  */
 503
 504static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 505				  int wasfull)
 506{
 507	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
 508	int isfull;
 509
 510	BUG_ON(i >= 1<<tn->bits);
 511
 512	/* update emptyChildren */
 513	if (n == NULL && chi != NULL)
 514		tn->empty_children++;
 515	else if (n != NULL && chi == NULL)
 516		tn->empty_children--;
 517
 518	/* update fullChildren */
 519	if (wasfull == -1)
 520		wasfull = tnode_full(tn, chi);
 521
 522	isfull = tnode_full(tn, n);
 523	if (wasfull && !isfull)
 524		tn->full_children--;
 525	else if (!wasfull && isfull)
 526		tn->full_children++;
 527
 528	if (n)
 529		node_set_parent(n, tn);
 530
 531	rcu_assign_pointer(tn->child[i], n);
 532}
 533
/* Upper bound on inflate (resp. halve) attempts in one resize() call. */
#define MAX_WORK 10
/*
 * Bring tnode 'tn' to a size that matches its fill.
 *
 * Returns the node that should take tn's place in the trie:
 *  - NULL if tn is NULL or has no children (tn is queued for freeing),
 *  - the only child if exactly one child remains (tn is queued for freeing
 *    and the child's parent pointer is cleared),
 *  - otherwise tn itself or the node produced by inflate()/halve().
 *
 * The caller is responsible for linking the returned node into the parent
 * (or the trie root); resize() does not do that.
 */
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
	int i;
	struct tnode *old_tn;
	int inflate_threshold_use;
	int halve_threshold_use;
	int max_work;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free_safe(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		goto one_child;
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
	 * Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
	 * multiply the left-hand side by 50.
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
	 * children, that is non-null tnodes with a skip value of 0.
	 * All of those will be doubled in the resulting inflated tnode, so
	 * we just count them one extra time here.
	 *
	 * A clearer way to write this would be:
	 *
	 * to_be_doubled = tn->full_children;
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
	 *
	 * ...and so on, though it would mess up the while () loop.
	 *
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
	 *
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
	 *
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >= inflate_threshold * new_child_length
	 *
	 * expand new_child_length:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >=
	 *      inflate_threshold * tnode_child_length(tn) * 2
	 *
	 * shorten again:
	 * 50 * (tn->full_children + tnode_child_length(tn) -
	 *    tn->empty_children) >= inflate_threshold *
	 *    tnode_child_length(tn)
	 *
	 */

	check_tnode(tn);

	/* Keep root node larger: a node without a parent uses the *_root thresholds */

	if (!node_parent((struct rt_trie_node *)tn)) {
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
	} else {
		inflate_threshold_use = inflate_threshold;
		halve_threshold_use = halve_threshold;
	}

	/*
	 * Inflate loop. Note that max_work is decremented as part of the
	 * loop condition, before the fill test is evaluated.
	 */
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);

		/* inflate() failed: keep the old node and stop trying */
		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	check_tnode(tn);

	/*
	 * Return if at least one inflate is run.
	 * NOTE(review): because max_work is decremented in the loop
	 * condition, this test is also true when full_children > 0 but the
	 * fill test failed on the first evaluation (no inflate ran), and
	 * when inflate() failed. In those cases the halve step below is
	 * skipped as well - confirm that this is intended.
	 */
	if (max_work != MAX_WORK)
		return (struct rt_trie_node *) tn;

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */

	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		/* halve() failed: keep the old node and stop trying */
		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}


	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
		/* find the single non-NULL slot */
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct rt_trie_node *n;

			n = rtnl_dereference(tn->child[i]);
			if (!n)
				continue;

			/*
			 * compress one level: detach the child from tn,
			 * queue tn for freeing and return the child in
			 * its place
			 */

			node_set_parent(n, NULL);
			tnode_free_safe(tn);
			return n;
		}
	}
	return (struct rt_trie_node *) tn;
}
 698
 699
 700static void tnode_clean_free(struct tnode *tn)
 701{
 702	int i;
 703	struct tnode *tofree;
 704
 705	for (i = 0; i < tnode_child_length(tn); i++) {
 706		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
 707		if (tofree)
 708			tnode_free(tofree);
 709	}
 710	tnode_free(tn);
 711}
 712
/*
 * Build a replacement for 'tn' that indexes one more bit, i.e. has twice as
 * many child slots, and move all children over.
 *
 * Returns the new tnode on success; the old tnode and every child tnode
 * that was split or absorbed are queued with tnode_free_safe(). Returns
 * ERR_PTR(-ENOMEM) if an allocation fails, in which case the old tnode is
 * left untouched and whatever had been allocated for the new one is freed.
 */
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode;

		inode = (struct tnode *) tnode_get_child(oldtnode, i);
		/*
		 * Only a "full" child tnode (no skipped bits) with more
		 * than two children needs to be split into two new nodes.
		 */
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			/*
			 * Mask with the top bit of the key shifted right by
			 * inode->pos: the bit that separates 'left' (bit
			 * cleared) from 'right' (bit set).
			 */
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				/* 'left' is not linked into tn yet */
				tnode_free(left);
				goto nomem;
			}

			/* park both halves in tn until the second pass */
			put_child(t, tn, 2*i, (struct rt_trie_node *) left);
			put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
		}
	}

	/* Second pass: no allocation happens here, so it cannot fail. */
	for (i = 0; i < olen; i++) {
		struct tnode *inode;
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/*
		 * A leaf or an internal node with skipped bits: it moves
		 * as a whole into slot 2*i or 2*i+1, selected by the key
		 * bit that the inflated node newly indexes.
		 */

		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		   tn->pos + tn->bits - 1) {
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/*
		 * An internal node with two children: the children move
		 * straight into tn and the node itself is dropped.
		 */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));

			tnode_free_safe(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right.
		 * Both nodes were already allocated, with that bit
		 * synthesized into their keys, by the preallocation
		 * loop above and are waiting in slots 2*i and 2*i+1.
		 */

		/*
		 * Take the preallocated nodes out of tn again while they
		 * are being filled.
		 */

		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);

		BUG_ON(!left);

		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);

		BUG_ON(!right);

		/* lower half of inode's children -> left, upper half -> right */
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, rtnl_dereference(inode->child[j]));
			put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
		}
		/* resize each half before linking it back into tn */
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

		tnode_free_safe(inode);
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	/* free the new node and everything preallocated into it */
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
 845
/*
 * Build a replacement for 'tn' that indexes one bit less, i.e. has half as
 * many child slots. Each pair of adjacent old slots (i, i+1) maps to new
 * slot i/2.
 *
 * Returns the new tnode on success and queues the old one with
 * tnode_free_safe(). Returns ERR_PTR(-ENOMEM) if an allocation fails, in
 * which case the old tnode is left untouched and whatever had been
 * allocated for the new one is freed.
 */
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	struct rt_trie_node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/*
		 * Two nonempty children: they cannot share one slot, so a
		 * one-bit tnode is needed to hold both of them.
		 */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			/* park it in tn until the second pass */
			put_child(t, tn, i/2, (struct rt_trie_node *)newn);
		}

	}

	/* Second pass: no allocation happens here, so it cannot fail. */
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(t, tn, i/2, left);
			continue;
		}

		/*
		 * Two nonempty children: fetch the preallocated one-bit
		 * node, take it out of tn while it is filled, then link
		 * the resized result back in.
		 */
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	/* free the new node and everything preallocated into it */
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
 917
 918/* readside must use rcu_read_lock currently dump routines
 919 via get_fa_head and dump */
 
 
 920
 921static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 922{
 923	struct hlist_head *head = &l->list;
 924	struct hlist_node *node;
 925	struct leaf_info *li;
 926
 927	hlist_for_each_entry_rcu(li, node, head, hlist)
 928		if (li->plen == plen)
 929			return li;
 930
 931	return NULL;
 932}
 933
 934static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 935{
 936	struct leaf_info *li = find_leaf_info(l, plen);
 937
 938	if (!li)
 939		return NULL;
 
 
 
 940
 941	return &li->falh;
 
 942}
 943
 944static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 
 945{
 946	struct leaf_info *li = NULL, *last = NULL;
 947	struct hlist_node *node;
 
 
 
 
 948
 949	if (hlist_empty(head)) {
 950		hlist_add_head_rcu(&new->hlist, head);
 951	} else {
 952		hlist_for_each_entry(li, node, head, hlist) {
 953			if (new->plen > li->plen)
 954				break;
 
 
 955
 956			last = li;
 
 
 
 
 
 
 
 
 
 957		}
 958		if (last)
 959			hlist_add_after_rcu(&last->hlist, &new->hlist);
 960		else
 961			hlist_add_before_rcu(&new->hlist, &li->hlist);
 962	}
 963}
 964
 965/* rcu_read_lock needs to be hold by caller from readside */
 
 
 
 
 
 966
 967static struct leaf *
 968fib_find_node(struct trie *t, u32 key)
 969{
 970	int pos;
 971	struct tnode *tn;
 972	struct rt_trie_node *n;
 973
 974	pos = 0;
 975	n = rcu_dereference_rtnl(t->trie);
 976
 977	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 978		tn = (struct tnode *) n;
 979
 980		check_tnode(tn);
 981
 982		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 983			pos = tn->pos + tn->bits;
 984			n = tnode_get_child_rcu(tn,
 985						tkey_extract_bits(key,
 986								  tn->pos,
 987								  tn->bits));
 988		} else
 989			break;
 
 
 
 
 990	}
 991	/* Case we have found a leaf. Compare prefixes */
 992
 993	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 994		return (struct leaf *)n;
 
 995
 996	return NULL;
 
 997}
 998
 999static void trie_rebalance(struct trie *t, struct tnode *tn)
1000{
1001	int wasfull;
1002	t_key cindex, key;
1003	struct tnode *tp;
1004
1005	key = tn->key;
 
 
 
1006
1007	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1008		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1009		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1010		tn = (struct tnode *) resize(t, (struct tnode *)tn);
 
 
 
 
 
 
 
 
1011
1012		tnode_put_child_reorg((struct tnode *)tp, cindex,
1013				      (struct rt_trie_node *)tn, wasfull);
 
 
 
 
1014
1015		tp = node_parent((struct rt_trie_node *) tn);
1016		if (!tp)
1017			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1018
1019		tnode_free_flush();
1020		if (!tp)
1021			break;
1022		tn = tp;
1023	}
1024
1025	/* Handle last (top) tnode */
1026	if (IS_TNODE(tn))
1027		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1028
1029	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1030	tnode_free_flush();
1031}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1032
/*
 * Insert a leaf_info for (key, plen) into the trie, creating the leaf and
 * an intermediate tnode where needed, then rebalance.
 *
 * Returns the (empty) alias list head of the new leaf_info, or NULL if an
 * allocation failed. Note that a new leaf_info is always allocated, also
 * when a leaf with this key already exists.
 *
 * only used from updater-side
 */

static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct rt_trie_node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = rtnl_dereference(t->trie);

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in it, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just add a leaf_info for this prefix length
	 * to the T_LEAF and return its alias list.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			/* descend: tn becomes the parent of the next node */
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		l = (struct leaf *) n;
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		/* the trie shape is unchanged, so no rebalance is needed */
		goto done;
	}
	l = leaf_new();

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		free_leaf(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct rt_trie_node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			/*
			 * The new one-bit tnode is placed at the position
			 * tkey_mismatch() reports for key versus n->key.
			 */
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			free_leaf(l);
			return NULL;
		}

		node_set_parent((struct rt_trie_node *)tn, tp);

		/* the new leaf takes one slot, the old subtree (if any) the other */
		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(t, tn, missbit, (struct rt_trie_node *)l);
		put_child(t, tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, (struct tnode *)tp, cindex,
				  (struct rt_trie_node *)tn);
		} else {
			/* no parent: the new tnode becomes the root */
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warning("fib_trie"
			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			   tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}
1182
/*
 * Add the route described by 'cfg' to table 'tb'.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL   prefix length above 32, or destination has bits set outside
 *             the prefix mask
 *   -EEXIST   a matching alias already exists and the netlink flags do not
 *             permit the request
 *   -ENOENT   no matching alias exists and NLM_F_CREATE is not set
 *   -ENOBUFS  the fib_alias could not be allocated
 *   -ENOMEM   fib_insert_node() failed
 *   or whatever error fib_create_info() returned.
 *
 * On every failure after fib_create_info() succeeded, the fib_info
 * reference is dropped again with fib_release_info().
 *
 * Caller must hold RTNL.
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct list_head *fa_head = NULL;
	struct fib_info *fi;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	mask = ntohl(inet_make_mask(plen));

	/* reject destinations that have host bits set */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;

	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	l = fib_find_node(t, key);
	fa = NULL;

	if (l) {
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new alias and
	 * append it to the alias list.
	 *
	 * If fa_head is NULL, no leaf_info matched the destination key
	 * and prefix length, and we need to create one via
	 * fib_insert_node() as well.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		/* step back one entry so that the scan below starts at fa_first */
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			fa = fa_first;
			if (fa_match) {
				/*
				 * Replacing the first alias with an identical
				 * one is a no-op and reported as success; an
				 * identical alias further down the list leaves
				 * err at -EEXIST.
				 */
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (new_fa == NULL)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;

			/* swap the aliases in place; the old one is freed via RCU */
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			/* flush the route cache only if the old alias had been used */
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		/*
		 * Without NLM_F_APPEND the new alias goes in front of the
		 * first alias with these keys; with it, 'fa' stays where the
		 * scan above stopped.
		 */
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

	if (!fa_head) {
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
			goto out_free_new_fa;
		}
	}

	/* a zero-length prefix is counted in tb_num_default */
	if (!plen)
		tb->tb_num_default++;

	/* insert before 'fa' if there is one, otherwise at the list tail */
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));

	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
succeeded:
	return 0;

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}
1352
1353/* should be called with rcu_read_lock */
1354static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1355		      t_key key,  const struct flowi4 *flp,
1356		      struct fib_result *res, int fib_flags)
1357{
1358	struct leaf_info *li;
1359	struct hlist_head *hhead = &l->list;
1360	struct hlist_node *node;
1361
1362	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1363		struct fib_alias *fa;
1364
1365		if (l->key != (key & li->mask_plen))
1366			continue;
1367
1368		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1369			struct fib_info *fi = fa->fa_info;
1370			int nhsel, err;
1371
1372			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1373				continue;
1374			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1375				continue;
1376			fib_alias_accessed(fa);
1377			err = fib_props[fa->fa_type].error;
1378			if (err) {
1379#ifdef CONFIG_IP_FIB_TRIE_STATS
1380				t->stats.semantic_match_passed++;
1381#endif
1382				return err;
1383			}
1384			if (fi->fib_flags & RTNH_F_DEAD)
1385				continue;
1386			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1387				const struct fib_nh *nh = &fi->fib_nh[nhsel];
1388
1389				if (nh->nh_flags & RTNH_F_DEAD)
1390					continue;
1391				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1392					continue;
1393
1394#ifdef CONFIG_IP_FIB_TRIE_STATS
1395				t->stats.semantic_match_passed++;
1396#endif
1397				res->prefixlen = li->plen;
1398				res->nh_sel = nhsel;
1399				res->type = fa->fa_type;
1400				res->scope = fa->fa_info->fib_scope;
1401				res->fi = fi;
1402				res->table = tb;
1403				res->fa_head = &li->falh;
1404				if (!(fib_flags & FIB_LOOKUP_NOREF))
1405					atomic_inc(&fi->fib_clntref);
1406				return 0;
1407			}
1408		}
1409
1410#ifdef CONFIG_IP_FIB_TRIE_STATS
1411		t->stats.semantic_match_miss++;
1412#endif
1413	}
1414
1415	return 1;
1416}
1417
 
1418int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1419		     struct fib_result *res, int fib_flags)
1420{
1421	struct trie *t = (struct trie *) tb->tb_data;
1422	int ret;
1423	struct rt_trie_node *n;
1424	struct tnode *pn;
1425	unsigned int pos, bits;
1426	t_key key = ntohl(flp->daddr);
1427	unsigned int chopped_off;
1428	t_key cindex = 0;
1429	unsigned int current_prefix_length = KEYLENGTH;
1430	struct tnode *cn;
1431	t_key pref_mismatch;
1432
1433	rcu_read_lock();
1434
1435	n = rcu_dereference(t->trie);
1436	if (!n)
1437		goto failed;
1438
1439#ifdef CONFIG_IP_FIB_TRIE_STATS
1440	t->stats.gets++;
1441#endif
 
 
 
 
 
 
 
 
1442
1443	/* Just a leaf? */
1444	if (IS_LEAF(n)) {
1445		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1446		goto found;
1447	}
1448
1449	pn = (struct tnode *) n;
1450	chopped_off = 0;
1451
1452	while (pn) {
1453		pos = pn->pos;
1454		bits = pn->bits;
1455
1456		if (!chopped_off)
1457			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1458						   pos, bits);
1459
1460		n = tnode_get_child_rcu(pn, cindex);
1461
1462		if (n == NULL) {
1463#ifdef CONFIG_IP_FIB_TRIE_STATS
1464			t->stats.null_node_hit++;
1465#endif
1466			goto backtrace;
1467		}
1468
1469		if (IS_LEAF(n)) {
1470			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1471			if (ret > 0)
1472				goto backtrace;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1473			goto found;
 
 
 
 
 
 
 
1474		}
1475
1476		cn = (struct tnode *)n;
 
 
 
1477
1478		/*
1479		 * It's a tnode, and we can do some extra checks here if we
1480		 * like, to avoid descending into a dead-end branch.
1481		 * This tnode is in the parent's child array at index
1482		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1483		 * chopped off, so in reality the index may be just a
1484		 * subprefix, padded with zero at the end.
1485		 * We can also take a look at any skipped bits in this
1486		 * tnode - everything up to p_pos is supposed to be ok,
1487		 * and the non-chopped bits of the index (se previous
1488		 * paragraph) are also guaranteed ok, but the rest is
1489		 * considered unknown.
1490		 *
1491		 * The skipped bits are key[pos+bits..cn->pos].
1492		 */
 
 
1493
1494		/* If current_prefix_length < pos+bits, we are already doing
1495		 * actual prefix  matching, which means everything from
1496		 * pos+(bits-chopped_off) onward must be zero along some
1497		 * branch of this subtree - otherwise there is *no* valid
1498		 * prefix present. Here we can only check the skipped
1499		 * bits. Remember, since we have already indexed into the
1500		 * parent's child array, we know that the bits we chopped of
1501		 * *are* zero.
1502		 */
1503
1504		/* NOTA BENE: Checking only skipped bits
1505		   for the new node here */
1506
1507		if (current_prefix_length < pos+bits) {
1508			if (tkey_extract_bits(cn->key, current_prefix_length,
1509						cn->pos - current_prefix_length)
1510			    || !(cn->child[0]))
1511				goto backtrace;
1512		}
1513
1514		/*
1515		 * If chopped_off=0, the index is fully validated and we
1516		 * only need to look at the skipped bits for this, the new,
1517		 * tnode. What we actually want to do is to find out if
1518		 * these skipped bits match our key perfectly, or if we will
1519		 * have to count on finding a matching prefix further down,
1520		 * because if we do, we would like to have some way of
1521		 * verifying the existence of such a prefix at this point.
1522		 */
1523
1524		/* The only thing we can do at this point is to verify that
1525		 * any such matching prefix can indeed be a prefix to our
1526		 * key, and if the bits in the node we are inspecting that
1527		 * do not match our key are not ZERO, this cannot be true.
1528		 * Thus, find out where there is a mismatch (before cn->pos)
1529		 * and verify that all the mismatching bits are zero in the
1530		 * new tnode's key.
1531		 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1532
1533		/*
1534		 * Note: We aren't very concerned about the piece of
1535		 * the key that precede pn->pos+pn->bits, since these
1536		 * have already been checked. The bits after cn->pos
1537		 * aren't checked since these are by definition
1538		 * "unknown" at this point. Thus, what we want to see
1539		 * is if we are about to enter the "prefix matching"
1540		 * state, and in that case verify that the skipped
1541		 * bits that will prevail throughout this subtree are
1542		 * zero, as they have to be if we are to find a
1543		 * matching prefix.
1544		 */
1545
1546		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
 
 
 
1547
1548		/*
1549		 * In short: If skipped bits in this node do not match
1550		 * the search key, enter the "prefix matching"
1551		 * state directly.
1552		 */
1553		if (pref_mismatch) {
1554			int mp = KEYLENGTH - fls(pref_mismatch);
1555
1556			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1557				goto backtrace;
 
 
1558
1559			if (current_prefix_length >= cn->pos)
1560				current_prefix_length = mp;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1561		}
 
 
1562
1563		pn = (struct tnode *)n; /* Descend */
1564		chopped_off = 0;
1565		continue;
 
1566
1567backtrace:
1568		chopped_off++;
1569
1570		/* As zero don't change the child key (cindex) */
1571		while ((chopped_off <= pn->bits)
1572		       && !(cindex & (1<<(chopped_off-1))))
1573			chopped_off++;
1574
1575		/* Decrease current_... with bits chopped off */
1576		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1577			current_prefix_length = pn->pos + pn->bits
1578				- chopped_off;
1579
1580		/*
1581		 * Either we do the actual chop off according or if we have
1582		 * chopped off all bits in this tnode walk up to our parent.
1583		 */
1584
1585		if (chopped_off <= pn->bits) {
1586			cindex &= ~(1 << (chopped_off-1));
1587		} else {
1588			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1589			if (!parent)
1590				goto failed;
1591
1592			/* Get Child's index */
1593			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1594			pn = parent;
1595			chopped_off = 0;
1596
 
 
 
 
 
 
 
 
 
1597#ifdef CONFIG_IP_FIB_TRIE_STATS
1598			t->stats.backtrack++;
1599#endif
1600			goto backtrace;
 
 
1601		}
1602	}
1603failed:
1604	ret = 1;
1605found:
1606	rcu_read_unlock();
1607	return ret;
1608}
 
1609
1610/*
1611 * Remove the leaf and return parent.
1612 */
1613static void trie_leaf_remove(struct trie *t, struct leaf *l)
1614{
1615	struct tnode *tp = node_parent((struct rt_trie_node *) l);
 
 
1616
1617	pr_debug("entering trie_leaf_remove(%p)\n", l);
 
1618
1619	if (tp) {
1620		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1621		put_child(t, (struct tnode *)tp, cindex, NULL);
 
 
 
 
 
1622		trie_rebalance(t, tp);
1623	} else
1624		rcu_assign_pointer(t->trie, NULL);
 
 
 
 
1625
1626	free_leaf(l);
 
 
1627}
1628
1629/*
1630 * Caller must hold RTNL.
1631 */
1632int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1633{
1634	struct trie *t = (struct trie *) tb->tb_data;
1635	u32 key, mask;
1636	int plen = cfg->fc_dst_len;
 
 
1637	u8 tos = cfg->fc_tos;
1638	struct fib_alias *fa, *fa_to_delete;
1639	struct list_head *fa_head;
1640	struct leaf *l;
1641	struct leaf_info *li;
1642
1643	if (plen > 32)
1644		return -EINVAL;
1645
1646	key = ntohl(cfg->fc_dst);
1647	mask = ntohl(inet_make_mask(plen));
1648
1649	if (key & ~mask)
1650		return -EINVAL;
1651
1652	key = key & mask;
1653	l = fib_find_node(t, key);
1654
1655	if (!l)
1656		return -ESRCH;
1657
1658	fa_head = get_fa_head(l, plen);
1659	fa = fib_find_alias(fa_head, tos, 0);
1660
1661	if (!fa)
1662		return -ESRCH;
1663
1664	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1665
1666	fa_to_delete = NULL;
1667	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1668	list_for_each_entry_continue(fa, fa_head, fa_list) {
1669		struct fib_info *fi = fa->fa_info;
1670
1671		if (fa->fa_tos != tos)
 
 
1672			break;
1673
1674		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1675		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1676		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1677		    (!cfg->fc_prefsrc ||
1678		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1679		    (!cfg->fc_protocol ||
1680		     fi->fib_protocol == cfg->fc_protocol) &&
1681		    fib_nh_match(cfg, fi) == 0) {
 
1682			fa_to_delete = fa;
1683			break;
1684		}
1685	}
1686
1687	if (!fa_to_delete)
1688		return -ESRCH;
1689
1690	fa = fa_to_delete;
1691	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
 
1692		  &cfg->fc_nlinfo, 0);
1693
1694	l = fib_find_node(t, key);
1695	li = find_leaf_info(l, plen);
1696
1697	list_del_rcu(&fa->fa_list);
1698
1699	if (!plen)
1700		tb->tb_num_default--;
1701
1702	if (list_empty(fa_head)) {
1703		hlist_del_rcu(&li->hlist);
1704		free_leaf_info(li);
1705	}
1706
1707	if (hlist_empty(&l->list))
1708		trie_leaf_remove(t, l);
1709
1710	if (fa->fa_state & FA_S_ACCESSED)
1711		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1712
1713	fib_release_info(fa->fa_info);
1714	alias_free_mem_rcu(fa);
1715	return 0;
1716}
1717
1718static int trie_flush_list(struct list_head *head)
 
1719{
1720	struct fib_alias *fa, *fa_node;
1721	int found = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1722
1723	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1724		struct fib_info *fi = fa->fa_info;
 
 
 
1725
1726		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1727			list_del_rcu(&fa->fa_list);
1728			fib_release_info(fa->fa_info);
1729			alias_free_mem_rcu(fa);
1730			found++;
1731		}
 
 
 
 
 
 
 
 
 
 
 
 
 
1732	}
1733	return found;
 
 
 
 
 
 
1734}
1735
1736static int trie_flush_leaf(struct leaf *l)
1737{
1738	int found = 0;
1739	struct hlist_head *lih = &l->list;
1740	struct hlist_node *node, *tmp;
1741	struct leaf_info *li = NULL;
1742
1743	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1744		found += trie_flush_list(&li->falh);
1745
1746		if (list_empty(&li->falh)) {
1747			hlist_del_rcu(&li->hlist);
1748			free_leaf_info(li);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1749		}
 
 
 
1750	}
1751	return found;
 
 
 
 
1752}
1753
1754/*
1755 * Scan for the next right leaf starting at node p->child[idx]
1756 * Since we have back pointer, no recursion necessary.
1757 */
1758static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1759{
1760	do {
1761		t_key idx;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1762
1763		if (c)
1764			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1765		else
1766			idx = 0;
1767
1768		while (idx < 1u << p->bits) {
1769			c = tnode_get_child_rcu(p, idx++);
1770			if (!c)
1771				continue;
1772
1773			if (IS_LEAF(c)) {
1774				prefetch(rcu_dereference_rtnl(p->child[idx]));
1775				return (struct leaf *) c;
 
 
 
 
 
 
 
 
 
 
 
 
1776			}
 
1777
1778			/* Rescan start scanning in new node */
1779			p = (struct tnode *) c;
1780			idx = 0;
1781		}
 
1782
1783		/* Node empty, walk back up to parent */
1784		c = (struct rt_trie_node *) p;
1785	} while ((p = node_parent_rcu(c)) != NULL);
1786
1787	return NULL; /* Root of trie */
1788}
1789
1790static struct leaf *trie_firstleaf(struct trie *t)
 
1791{
1792	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1793
1794	if (!n)
1795		return NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1796
1797	if (IS_LEAF(n))          /* trie is just a leaf */
1798		return (struct leaf *) n;
1799
1800	return leaf_walk_rcu(n, NULL);
 
 
 
 
1801}
1802
1803static struct leaf *trie_nextleaf(struct leaf *l)
 
1804{
1805	struct rt_trie_node *c = (struct rt_trie_node *) l;
1806	struct tnode *p = node_parent_rcu(c);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1807
1808	if (!p)
1809		return NULL;	/* trie with just one leaf */
 
 
 
 
 
1810
1811	return leaf_walk_rcu(p, c);
1812}
 
 
1813
/*
 * Return the leaf at zero-based position @index in walk order, or NULL
 * when the trie holds fewer leaves than that.
 */
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l;

	for (l = trie_firstleaf(t); l && index > 0; index--)
		l = trie_nextleaf(l);

	return l;
}
 
 
 
 
1823
 
 
 
 
 
 
 
1824
1825/*
1826 * Caller must hold RTNL.
1827 */
1828int fib_table_flush(struct fib_table *tb)
1829{
1830	struct trie *t = (struct trie *) tb->tb_data;
1831	struct leaf *l, *ll = NULL;
1832	int found = 0;
 
1833
1834	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1835		found += trie_flush_leaf(l);
1836
1837		if (ll && hlist_empty(&ll->list))
1838			trie_leaf_remove(t, ll);
1839		ll = l;
 
1840	}
1841
1842	if (ll && hlist_empty(&ll->list))
1843		trie_leaf_remove(t, ll);
1844
1845	pr_debug("trie_flush found=%d\n", found);
1846	return found;
1847}
1848
/*
 * Free table @tb.  Only the table allocation itself is released here;
 * trie nodes are not walked or freed by this function.
 */
void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}
1853
1854static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1855			   struct fib_table *tb,
1856			   struct sk_buff *skb, struct netlink_callback *cb)
1857{
1858	int i, s_i;
1859	struct fib_alias *fa;
1860	__be32 xkey = htonl(key);
1861
1862	s_i = cb->args[5];
1863	i = 0;
1864
1865	/* rcu_read_lock is hold by caller */
 
1866
1867	list_for_each_entry_rcu(fa, fah, fa_list) {
1868		if (i < s_i) {
1869			i++;
 
1870			continue;
1871		}
1872
1873		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1874				  cb->nlh->nlmsg_seq,
1875				  RTM_NEWROUTE,
1876				  tb->tb_id,
1877				  fa->fa_type,
1878				  xkey,
1879				  plen,
1880				  fa->fa_tos,
1881				  fa->fa_info, NLM_F_MULTI) < 0) {
1882			cb->args[5] = i;
1883			return -1;
1884		}
1885		i++;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1886	}
1887	cb->args[5] = i;
1888	return skb->len;
1889}
1890
1891static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1892			struct sk_buff *skb, struct netlink_callback *cb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1893{
1894	struct leaf_info *li;
1895	struct hlist_node *node;
1896	int i, s_i;
 
 
 
 
 
1897
1898	s_i = cb->args[4];
 
1899	i = 0;
1900
1901	/* rcu_read_lock is hold by caller */
1902	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1903		if (i < s_i) {
1904			i++;
1905			continue;
1906		}
 
 
 
 
 
1907
1908		if (i > s_i)
1909			cb->args[5] = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1910
1911		if (list_empty(&li->falh))
1912			continue;
1913
1914		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1915			cb->args[4] = i;
1916			return -1;
 
 
1917		}
 
 
1918		i++;
1919	}
1920
1921	cb->args[4] = i;
1922	return skb->len;
 
 
 
 
 
1923}
1924
 
1925int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1926		   struct netlink_callback *cb)
1927{
1928	struct leaf *l;
1929	struct trie *t = (struct trie *) tb->tb_data;
1930	t_key key = cb->args[2];
1931	int count = cb->args[3];
1932
1933	rcu_read_lock();
1934	/* Dump starting at last key.
1935	 * Note: 0.0.0.0/0 (ie default) is first key.
1936	 */
1937	if (count == 0)
1938		l = trie_firstleaf(t);
1939	else {
1940		/* Normally, continue from last key, but if that is missing
1941		 * fallback to using slow rescan
1942		 */
1943		l = fib_find_node(t, key);
1944		if (!l)
1945			l = trie_leafindex(t, count);
1946	}
1947
1948	while (l) {
1949		cb->args[2] = l->key;
1950		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1951			cb->args[3] = count;
1952			rcu_read_unlock();
1953			return -1;
1954		}
1955
1956		++count;
1957		l = trie_nextleaf(l);
 
1958		memset(&cb->args[4], 0,
1959		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
 
 
 
 
1960	}
1961	cb->args[3] = count;
1962	rcu_read_unlock();
 
1963
1964	return skb->len;
1965}
1966
1967void __init fib_trie_init(void)
1968{
1969	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1970					  sizeof(struct fib_alias),
1971					  0, SLAB_PANIC, NULL);
1972
1973	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1974					   max(sizeof(struct leaf),
1975					       sizeof(struct leaf_info)),
1976					   0, SLAB_PANIC, NULL);
1977}
1978
1979
1980struct fib_table *fib_trie_table(u32 id)
1981{
1982	struct fib_table *tb;
1983	struct trie *t;
 
 
 
 
1984
1985	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1986		     GFP_KERNEL);
1987	if (tb == NULL)
1988		return NULL;
1989
1990	tb->tb_id = id;
1991	tb->tb_default = -1;
1992	tb->tb_num_default = 0;
 
 
 
 
1993
1994	t = (struct trie *) tb->tb_data;
1995	memset(t, 0, sizeof(*t));
 
 
 
 
 
 
 
 
1996
1997	return tb;
1998}
1999
2000#ifdef CONFIG_PROC_FS
2001/* Depth first Trie walk iterator */
2002struct fib_trie_iter {
2003	struct seq_net_private p;
2004	struct fib_table *tb;
2005	struct tnode *tnode;
2006	unsigned int index;
2007	unsigned int depth;
2008};
2009
2010static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2011{
2012	struct tnode *tn = iter->tnode;
2013	unsigned int cindex = iter->index;
2014	struct tnode *p;
2015
2016	/* A single entry routing table */
2017	if (!tn)
2018		return NULL;
2019
2020	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2021		 iter->tnode, iter->index, iter->depth);
2022rescan:
2023	while (cindex < (1<<tn->bits)) {
2024		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2025
2026		if (n) {
 
 
 
 
 
 
2027			if (IS_LEAF(n)) {
2028				iter->tnode = tn;
2029				iter->index = cindex + 1;
2030			} else {
2031				/* push down one level */
2032				iter->tnode = (struct tnode *) n;
2033				iter->index = 0;
2034				++iter->depth;
2035			}
 
2036			return n;
2037		}
2038
2039		++cindex;
 
 
 
 
2040	}
2041
2042	/* Current node exhausted, pop back up */
2043	p = node_parent_rcu((struct rt_trie_node *)tn);
2044	if (p) {
2045		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2046		tn = p;
2047		--iter->depth;
2048		goto rescan;
2049	}
2050
2051	/* got root? */
2052	return NULL;
2053}
2054
2055static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2056				       struct trie *t)
2057{
2058	struct rt_trie_node *n;
2059
2060	if (!t)
2061		return NULL;
2062
2063	n = rcu_dereference(t->trie);
 
2064	if (!n)
2065		return NULL;
2066
2067	if (IS_TNODE(n)) {
2068		iter->tnode = (struct tnode *) n;
2069		iter->index = 0;
2070		iter->depth = 1;
2071	} else {
2072		iter->tnode = NULL;
2073		iter->index = 0;
2074		iter->depth = 0;
2075	}
2076
2077	return n;
2078}
2079
2080static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2081{
2082	struct rt_trie_node *n;
2083	struct fib_trie_iter iter;
2084
2085	memset(s, 0, sizeof(*s));
2086
2087	rcu_read_lock();
2088	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2089		if (IS_LEAF(n)) {
2090			struct leaf *l = (struct leaf *)n;
2091			struct leaf_info *li;
2092			struct hlist_node *tmp;
2093
2094			s->leaves++;
2095			s->totdepth += iter.depth;
2096			if (iter.depth > s->maxdepth)
2097				s->maxdepth = iter.depth;
2098
2099			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2100				++s->prefixes;
2101		} else {
2102			const struct tnode *tn = (const struct tnode *) n;
2103			int i;
2104
2105			s->tnodes++;
2106			if (tn->bits < MAX_STAT_DEPTH)
2107				s->nodesizes[tn->bits]++;
2108
2109			for (i = 0; i < (1<<tn->bits); i++)
2110				if (!tn->child[i])
2111					s->nullpointers++;
2112		}
2113	}
2114	rcu_read_unlock();
2115}
2116
2117/*
2118 *	This outputs /proc/net/fib_triestats
2119 */
2120static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2121{
2122	unsigned int i, max, pointers, bytes, avdepth;
2123
2124	if (stat->leaves)
2125		avdepth = stat->totdepth*100 / stat->leaves;
2126	else
2127		avdepth = 0;
2128
2129	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2130		   avdepth / 100, avdepth % 100);
2131	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2132
2133	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2134	bytes = sizeof(struct leaf) * stat->leaves;
2135
2136	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2137	bytes += sizeof(struct leaf_info) * stat->prefixes;
2138
2139	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2140	bytes += sizeof(struct tnode) * stat->tnodes;
2141
2142	max = MAX_STAT_DEPTH;
2143	while (max > 0 && stat->nodesizes[max-1] == 0)
2144		max--;
2145
2146	pointers = 0;
2147	for (i = 1; i <= max; i++)
2148		if (stat->nodesizes[i] != 0) {
2149			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2150			pointers += (1<<i) * stat->nodesizes[i];
2151		}
2152	seq_putc(seq, '\n');
2153	seq_printf(seq, "\tPointers: %u\n", pointers);
2154
2155	bytes += sizeof(struct rt_trie_node *) * pointers;
2156	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2157	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2158}
2159
2160#ifdef CONFIG_IP_FIB_TRIE_STATS
2161static void trie_show_usage(struct seq_file *seq,
2162			    const struct trie_use_stats *stats)
2163{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2164	seq_printf(seq, "\nCounters:\n---------\n");
2165	seq_printf(seq, "gets = %u\n", stats->gets);
2166	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2167	seq_printf(seq, "semantic match passed = %u\n",
2168		   stats->semantic_match_passed);
2169	seq_printf(seq, "semantic match miss = %u\n",
2170		   stats->semantic_match_miss);
2171	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2172	seq_printf(seq, "skipped node resize = %u\n\n",
2173		   stats->resize_node_skipped);
2174}
2175#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2176
2177static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2178{
2179	if (tb->tb_id == RT_TABLE_LOCAL)
2180		seq_puts(seq, "Local:\n");
2181	else if (tb->tb_id == RT_TABLE_MAIN)
2182		seq_puts(seq, "Main:\n");
2183	else
2184		seq_printf(seq, "Id %d:\n", tb->tb_id);
2185}
2186
2187
2188static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2189{
2190	struct net *net = (struct net *)seq->private;
2191	unsigned int h;
2192
2193	seq_printf(seq,
2194		   "Basic info: size of leaf:"
2195		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2196		   sizeof(struct leaf), sizeof(struct tnode));
2197
2198	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2199		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2200		struct hlist_node *node;
2201		struct fib_table *tb;
2202
2203		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2204			struct trie *t = (struct trie *) tb->tb_data;
2205			struct trie_stat stat;
2206
2207			if (!t)
2208				continue;
2209
2210			fib_table_print(seq, tb);
2211
2212			trie_collect_stats(t, &stat);
2213			trie_show_stats(seq, &stat);
2214#ifdef CONFIG_IP_FIB_TRIE_STATS
2215			trie_show_usage(seq, &t->stats);
2216#endif
2217		}
2218	}
2219
2220	return 0;
2221}
2222
2223static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2224{
2225	return single_open_net(inode, file, fib_triestat_seq_show);
2226}
2227
2228static const struct file_operations fib_triestat_fops = {
2229	.owner	= THIS_MODULE,
2230	.open	= fib_triestat_seq_open,
2231	.read	= seq_read,
2232	.llseek	= seq_lseek,
2233	.release = single_release_net,
2234};
2235
2236static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2237{
2238	struct fib_trie_iter *iter = seq->private;
2239	struct net *net = seq_file_net(seq);
2240	loff_t idx = 0;
2241	unsigned int h;
2242
2243	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2244		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2245		struct hlist_node *node;
2246		struct fib_table *tb;
2247
2248		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2249			struct rt_trie_node *n;
2250
2251			for (n = fib_trie_get_first(iter,
2252						    (struct trie *) tb->tb_data);
2253			     n; n = fib_trie_get_next(iter))
2254				if (pos == idx++) {
2255					iter->tb = tb;
2256					return n;
2257				}
2258		}
2259	}
2260
2261	return NULL;
2262}
2263
2264static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2265	__acquires(RCU)
2266{
2267	rcu_read_lock();
2268	return fib_trie_get_idx(seq, *pos);
2269}
2270
2271static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2272{
2273	struct fib_trie_iter *iter = seq->private;
2274	struct net *net = seq_file_net(seq);
2275	struct fib_table *tb = iter->tb;
2276	struct hlist_node *tb_node;
2277	unsigned int h;
2278	struct rt_trie_node *n;
2279
2280	++*pos;
2281	/* next node in same table */
2282	n = fib_trie_get_next(iter);
2283	if (n)
2284		return n;
2285
2286	/* walk rest of this hash chain */
2287	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2288	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2289		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2290		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2291		if (n)
2292			goto found;
2293	}
2294
2295	/* new hash chain */
2296	while (++h < FIB_TABLE_HASHSZ) {
2297		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2298		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2299			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2300			if (n)
2301				goto found;
2302		}
2303	}
2304	return NULL;
2305
2306found:
2307	iter->tb = tb;
2308	return n;
2309}
2310
2311static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2312	__releases(RCU)
2313{
2314	rcu_read_unlock();
2315}
2316
/* Emit @n indentation steps of three spaces each; nothing for n <= 0. */
static void seq_indent(struct seq_file *seq, int n)
{
	for (; n > 0; n--)
		seq_puts(seq, "   ");
}
2322
2323static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2324{
2325	switch (s) {
2326	case RT_SCOPE_UNIVERSE: return "universe";
2327	case RT_SCOPE_SITE:	return "site";
2328	case RT_SCOPE_LINK:	return "link";
2329	case RT_SCOPE_HOST:	return "host";
2330	case RT_SCOPE_NOWHERE:	return "nowhere";
2331	default:
2332		snprintf(buf, len, "scope=%d", s);
2333		return buf;
2334	}
2335}
2336
2337static const char *const rtn_type_names[__RTN_MAX] = {
2338	[RTN_UNSPEC] = "UNSPEC",
2339	[RTN_UNICAST] = "UNICAST",
2340	[RTN_LOCAL] = "LOCAL",
2341	[RTN_BROADCAST] = "BROADCAST",
2342	[RTN_ANYCAST] = "ANYCAST",
2343	[RTN_MULTICAST] = "MULTICAST",
2344	[RTN_BLACKHOLE] = "BLACKHOLE",
2345	[RTN_UNREACHABLE] = "UNREACHABLE",
2346	[RTN_PROHIBIT] = "PROHIBIT",
2347	[RTN_THROW] = "THROW",
2348	[RTN_NAT] = "NAT",
2349	[RTN_XRESOLVE] = "XRESOLVE",
2350};
2351
2352static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2353{
2354	if (t < __RTN_MAX && rtn_type_names[t])
2355		return rtn_type_names[t];
2356	snprintf(buf, len, "type %u", t);
2357	return buf;
2358}
2359
2360/* Pretty print the trie */
2361static int fib_trie_seq_show(struct seq_file *seq, void *v)
2362{
2363	const struct fib_trie_iter *iter = seq->private;
2364	struct rt_trie_node *n = v;
2365
2366	if (!node_parent_rcu(n))
2367		fib_table_print(seq, iter->tb);
2368
2369	if (IS_TNODE(n)) {
2370		struct tnode *tn = (struct tnode *) n;
2371		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2372
2373		seq_indent(seq, iter->depth-1);
2374		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2375			   &prf, tn->pos, tn->bits, tn->full_children,
2376			   tn->empty_children);
2377
2378	} else {
2379		struct leaf *l = (struct leaf *) n;
2380		struct leaf_info *li;
2381		struct hlist_node *node;
2382		__be32 val = htonl(l->key);
2383
2384		seq_indent(seq, iter->depth);
2385		seq_printf(seq, "  |-- %pI4\n", &val);
2386
2387		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2388			struct fib_alias *fa;
2389
2390			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2391				char buf1[32], buf2[32];
2392
2393				seq_indent(seq, iter->depth+1);
2394				seq_printf(seq, "  /%d %s %s", li->plen,
2395					   rtn_scope(buf1, sizeof(buf1),
2396						     fa->fa_info->fib_scope),
2397					   rtn_type(buf2, sizeof(buf2),
2398						    fa->fa_type));
2399				if (fa->fa_tos)
2400					seq_printf(seq, " tos=%d", fa->fa_tos);
2401				seq_putc(seq, '\n');
2402			}
2403		}
2404	}
2405
2406	return 0;
2407}
2408
2409static const struct seq_operations fib_trie_seq_ops = {
2410	.start  = fib_trie_seq_start,
2411	.next   = fib_trie_seq_next,
2412	.stop   = fib_trie_seq_stop,
2413	.show   = fib_trie_seq_show,
2414};
2415
2416static int fib_trie_seq_open(struct inode *inode, struct file *file)
2417{
2418	return seq_open_net(inode, file, &fib_trie_seq_ops,
2419			    sizeof(struct fib_trie_iter));
2420}
2421
2422static const struct file_operations fib_trie_fops = {
2423	.owner  = THIS_MODULE,
2424	.open   = fib_trie_seq_open,
2425	.read   = seq_read,
2426	.llseek = seq_lseek,
2427	.release = seq_release_net,
2428};
2429
2430struct fib_route_iter {
2431	struct seq_net_private p;
2432	struct trie *main_trie;
 
2433	loff_t	pos;
2434	t_key	key;
2435};
2436
2437static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
 
2438{
2439	struct leaf *l = NULL;
2440	struct trie *t = iter->main_trie;
2441
2442	/* use cache location of last found key */
2443	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2444		pos -= iter->pos;
2445	else {
2446		iter->pos = 0;
2447		l = trie_firstleaf(t);
2448	}
2449
2450	while (l && pos-- > 0) {
 
 
 
2451		iter->pos++;
2452		l = trie_nextleaf(l);
 
 
 
 
2453	}
2454
2455	if (l)
2456		iter->key = pos;	/* remember it */
2457	else
2458		iter->pos = 0;		/* forget it */
2459
2460	return l;
2461}
2462
2463static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2464	__acquires(RCU)
2465{
2466	struct fib_route_iter *iter = seq->private;
2467	struct fib_table *tb;
 
2468
2469	rcu_read_lock();
 
2470	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2471	if (!tb)
2472		return NULL;
2473
2474	iter->main_trie = (struct trie *) tb->tb_data;
2475	if (*pos == 0)
2476		return SEQ_START_TOKEN;
2477	else
2478		return fib_route_get_idx(iter, *pos - 1);
 
 
 
 
 
 
2479}
2480
2481static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2482{
2483	struct fib_route_iter *iter = seq->private;
2484	struct leaf *l = v;
 
2485
2486	++*pos;
2487	if (v == SEQ_START_TOKEN) {
 
 
 
 
 
 
 
 
2488		iter->pos = 0;
2489		l = trie_firstleaf(iter->main_trie);
2490	} else {
2491		iter->pos++;
2492		l = trie_nextleaf(l);
2493	}
2494
2495	if (l)
2496		iter->key = l->key;
2497	else
2498		iter->pos = 0;
2499	return l;
2500}
2501
2502static void fib_route_seq_stop(struct seq_file *seq, void *v)
2503	__releases(RCU)
2504{
2505	rcu_read_unlock();
2506}
2507
2508static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2509{
2510	unsigned int flags = 0;
2511
2512	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2513		flags = RTF_REJECT;
2514	if (fi && fi->fib_nh->nh_gw)
2515		flags |= RTF_GATEWAY;
 
 
 
 
2516	if (mask == htonl(0xFFFFFFFF))
2517		flags |= RTF_HOST;
2518	flags |= RTF_UP;
2519	return flags;
2520}
2521
2522/*
2523 *	This outputs /proc/net/route.
2524 *	The format of the file is not supposed to be changed
2525 *	and needs to be same as fib_hash output to avoid breaking
2526 *	legacy utilities
2527 */
2528static int fib_route_seq_show(struct seq_file *seq, void *v)
2529{
2530	struct leaf *l = v;
2531	struct leaf_info *li;
2532	struct hlist_node *node;
 
 
2533
2534	if (v == SEQ_START_TOKEN) {
2535		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2536			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2537			   "\tWindow\tIRTT");
2538		return 0;
2539	}
2540
2541	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2542		struct fib_alias *fa;
2543		__be32 mask, prefix;
2544
2545		mask = inet_make_mask(li->plen);
2546		prefix = htonl(l->key);
 
 
2547
2548		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2549			const struct fib_info *fi = fa->fa_info;
2550			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2551			int len;
2552
2553			if (fa->fa_type == RTN_BROADCAST
2554			    || fa->fa_type == RTN_MULTICAST)
2555				continue;
2556
2557			if (fi)
2558				seq_printf(seq,
2559					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2560					 "%d\t%08X\t%d\t%u\t%u%n",
2561					 fi->fib_dev ? fi->fib_dev->name : "*",
2562					 prefix,
2563					 fi->fib_nh->nh_gw, flags, 0, 0,
2564					 fi->fib_priority,
2565					 mask,
2566					 (fi->fib_advmss ?
2567					  fi->fib_advmss + 40 : 0),
2568					 fi->fib_window,
2569					 fi->fib_rtt >> 3, &len);
2570			else
2571				seq_printf(seq,
2572					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2573					 "%d\t%08X\t%d\t%u\t%u%n",
2574					 prefix, 0, flags, 0, 0, 0,
2575					 mask, 0, 0, 0, &len);
2576
2577			seq_printf(seq, "%*s\n", 127 - len, "");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2578		}
 
2579	}
2580
2581	return 0;
2582}
2583
2584static const struct seq_operations fib_route_seq_ops = {
2585	.start  = fib_route_seq_start,
2586	.next   = fib_route_seq_next,
2587	.stop   = fib_route_seq_stop,
2588	.show   = fib_route_seq_show,
2589};
2590
2591static int fib_route_seq_open(struct inode *inode, struct file *file)
2592{
2593	return seq_open_net(inode, file, &fib_route_seq_ops,
2594			    sizeof(struct fib_route_iter));
2595}
2596
2597static const struct file_operations fib_route_fops = {
2598	.owner  = THIS_MODULE,
2599	.open   = fib_route_seq_open,
2600	.read   = seq_read,
2601	.llseek = seq_lseek,
2602	.release = seq_release_net,
2603};
2604
2605int __net_init fib_proc_init(struct net *net)
2606{
2607	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
 
2608		goto out1;
2609
2610	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2611				  &fib_triestat_fops))
2612		goto out2;
2613
2614	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
 
2615		goto out3;
2616
2617	return 0;
2618
2619out3:
2620	proc_net_remove(net, "fib_triestat");
2621out2:
2622	proc_net_remove(net, "fib_trie");
2623out1:
2624	return -ENOMEM;
2625}
2626
2627void __net_exit fib_proc_exit(struct net *net)
2628{
2629	proc_net_remove(net, "fib_trie");
2630	proc_net_remove(net, "fib_triestat");
2631	proc_net_remove(net, "route");
2632}
2633
2634#endif /* CONFIG_PROC_FS */