Linux Audio

Check our new training course

Loading...
v3.1
 
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally described in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET		An implementation of the TCP/IP protocol suite for the LINUX
  30 *		operating system.  INET is implemented using the  BSD Socket
  31 *		interface as the means of communication with the user level.
  32 *
  33 *		IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *		This program is free software; you can redistribute it and/or
  39 *		modify it under the terms of the GNU General Public License
  40 *		as published by the Free Software Foundation; either version
  41 *		2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work come from:
  44 *
  45 *		David S. Miller, <davem@davemloft.net>
  46 *		Stephen Hemminger <shemminger@osdl.org>
  47 *		Paul E. McKenney <paulmck@us.ibm.com>
  48 *		Patrick McHardy <kaber@trash.net>
  49 */
  50
    /* Version string of this trie implementation (its users are not visible in this part of the file). */
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <asm/system.h>
  55#include <linux/bitops.h>
  56#include <linux/types.h>
  57#include <linux/kernel.h>
  58#include <linux/mm.h>
  59#include <linux/string.h>
  60#include <linux/socket.h>
  61#include <linux/sockios.h>
  62#include <linux/errno.h>
  63#include <linux/in.h>
  64#include <linux/inet.h>
  65#include <linux/inetdevice.h>
  66#include <linux/netdevice.h>
  67#include <linux/if_arp.h>
  68#include <linux/proc_fs.h>
  69#include <linux/rcupdate.h>
  70#include <linux/skbuff.h>
  71#include <linux/netlink.h>
  72#include <linux/init.h>
  73#include <linux/list.h>
  74#include <linux/slab.h>
  75#include <linux/prefetch.h>
 
 
  76#include <net/net_namespace.h>
 
  77#include <net/ip.h>
  78#include <net/protocol.h>
  79#include <net/route.h>
  80#include <net/tcp.h>
  81#include <net/sock.h>
  82#include <net/ip_fib.h>
 
 
  83#include "fib_lookup.h"
  84
  85#define MAX_STAT_DEPTH 32
  86
  87#define KEYLENGTH (8*sizeof(t_key))
  88
  89typedef unsigned int t_key;
 
 
 
 
 
 
 
 
 
 
 
  90
  91#define T_TNODE 0
  92#define T_LEAF  1
  93#define NODE_TYPE_MASK	0x1UL
  94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
 
 
 
 
 
 
 
 
 
 
 
 
  95
  96#define IS_TNODE(n) (!(n->parent & T_LEAF))
  97#define IS_LEAF(n) (n->parent & T_LEAF)
  98
  99struct rt_trie_node {
 100	unsigned long parent;
 101	t_key key;
 102};
 103
 104struct leaf {
 105	unsigned long parent;
 106	t_key key;
 107	struct hlist_head list;
 108	struct rcu_head rcu;
 109};
 110
 111struct leaf_info {
 112	struct hlist_node hlist;
 113	int plen;
 114	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
 115	struct list_head falh;
 116	struct rcu_head rcu;
 117};
 118
 119struct tnode {
 120	unsigned long parent;
 121	t_key key;
 122	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
 123	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 124	unsigned int full_children;	/* KEYLENGTH bits needed */
 125	unsigned int empty_children;	/* KEYLENGTH bits needed */
 126	union {
 127		struct rcu_head rcu;
 128		struct work_struct work;
 129		struct tnode *tnode_free;
 
 130	};
 131	struct rt_trie_node __rcu *child[0];
 132};
 133
 
 
 
 
 
 
 
 
 
 
 
 
 134#ifdef CONFIG_IP_FIB_TRIE_STATS
 135struct trie_use_stats {
 136	unsigned int gets;
 137	unsigned int backtrack;
 138	unsigned int semantic_match_passed;
 139	unsigned int semantic_match_miss;
 140	unsigned int null_node_hit;
 141	unsigned int resize_node_skipped;
 142};
 143#endif
 144
 145struct trie_stat {
 146	unsigned int totdepth;
 147	unsigned int maxdepth;
 148	unsigned int tnodes;
 149	unsigned int leaves;
 150	unsigned int nullpointers;
 151	unsigned int prefixes;
 152	unsigned int nodesizes[MAX_STAT_DEPTH];
 153};
 154
 155struct trie {
 156	struct rt_trie_node __rcu *trie;
 157#ifdef CONFIG_IP_FIB_TRIE_STATS
 158	struct trie_use_stats stats;
 159#endif
 160};
 161
 162static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
 163static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 164				  int wasfull);
 165static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
 166static struct tnode *inflate(struct trie *t, struct tnode *tn);
 167static struct tnode *halve(struct trie *t, struct tnode *tn);
 168/* tnodes to free after resize(); protected by RTNL */
 169static struct tnode *tnode_free_head;
 170static size_t tnode_free_size;
 171
 172/*
 173 * synchronize_rcu after call_rcu for that many pages; it should be especially
 174 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 175 * obtained experimentally, aiming to avoid visible slowdown.
 176 */
 177static const int sync_pages = 128;
 
 
 178
 179static struct kmem_cache *fn_alias_kmem __read_mostly;
 180static struct kmem_cache *trie_leaf_kmem __read_mostly;
 181
 182/*
 183 * caller must hold RTNL
 184 */
 185static inline struct tnode *node_parent(const struct rt_trie_node *node)
 186{
 187	unsigned long parent;
 188
 189	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
 190
 191	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 192}
 193
 194/*
 195 * caller must hold RCU read lock or RTNL
 196 */
 197static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
 198{
 199	unsigned long parent;
 200
 201	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
 202							   lockdep_rtnl_is_held());
 203
 204	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 205}
 
 206
 207/* Same as rcu_assign_pointer
 208 * but that macro() assumes that value is a pointer.
 209 */
 210static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
 211{
 212	smp_wmb();
 213	node->parent = (unsigned long)ptr | NODE_TYPE(node);
 214}
 215
 216/*
 217 * caller must hold RTNL
 218 */
 219static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
 220{
 221	BUG_ON(i >= 1U << tn->bits);
 222
 223	return rtnl_dereference(tn->child[i]);
 224}
 225
 226/*
 227 * caller must hold RCU read lock or RTNL
 228 */
 229static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
 230{
 231	BUG_ON(i >= 1U << tn->bits);
 232
 233	return rcu_dereference_rtnl(tn->child[i]);
 234}
 235
 236static inline int tnode_child_length(const struct tnode *tn)
 237{
 238	return 1 << tn->bits;
 239}
 240
 241static inline t_key mask_pfx(t_key k, unsigned int l)
 242{
 243	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 244}
 245
 246static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
 247{
 248	if (offset < KEYLENGTH)
 249		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 250	else
 251		return 0;
 252}
 253
 254static inline int tkey_equals(t_key a, t_key b)
 255{
 256	return a == b;
 257}
 258
 259static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 260{
 261	if (bits == 0 || offset >= KEYLENGTH)
 262		return 1;
 263	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 264	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 265}
 266
 267static inline int tkey_mismatch(t_key a, int offset, t_key b)
 268{
 269	t_key diff = a ^ b;
 270	int i = offset;
 271
 272	if (!diff)
 273		return 0;
 274	while ((diff << i) >> (KEYLENGTH-1) == 0)
 275		i++;
 276	return i;
 277}
 278
 279/*
 280  To understand this stuff, an understanding of keys and all their bits is
 281  necessary. Every node in the trie has a key associated with it, but not
 282  all of the bits in that key are significant.
 283
 284  Consider a node 'n' and its parent 'tp'.
 285
 286  If n is a leaf, every bit in its key is significant. Its presence is
 287  necessitated by path compression, since during a tree traversal (when
 288  searching for a leaf - unless we are doing an insertion) we will completely
 289  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 290  a potentially successful search, that we have indeed been walking the
 291  correct key path.
 292
 293  Note that we can never "miss" the correct key in the tree if present by
 294  following the wrong path. Path compression ensures that segments of the key
 295  that are the same for all keys with a given prefix are skipped, but the
 296  skipped part *is* identical for each node in the subtrie below the skipped
 297  bit! trie_insert() in this implementation takes care of that - note the
 298  call to tkey_sub_equals() in trie_insert().
 299
 300  if n is an internal node - a 'tnode' here, the various parts of its key
 301  have many different meanings.
 302
 303  Example:
 304  _________________________________________________________________
 305  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 306  -----------------------------------------------------------------
 307    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 308
 309  _________________________________________________________________
 310  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 311  -----------------------------------------------------------------
 312   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 313
 314  tp->pos = 7
 315  tp->bits = 3
 316  n->pos = 15
 317  n->bits = 4
 318
 319  First, let's just ignore the bits that come before the parent tp, that is
 320  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 321  not use them for anything.
 322
 323  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 324  index into the parent's child array. That is, they will be used to find
 325  'n' among tp's children.
 326
 327  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 328  for the node n.
 329
 330  All the bits we have seen so far are significant to the node n. The rest
 331  of the bits are really not needed or indeed known in n->key.
 332
 333  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 334  n's child array, and will of course be different for each child.
 335
 336
 337  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 338  at this point.
 339
 340*/
 341
 342static inline void check_tnode(const struct tnode *tn)
 343{
 344	WARN_ON(tn && tn->pos+tn->bits > 32);
 345}
 346
 347static const int halve_threshold = 25;
 348static const int inflate_threshold = 50;
 349static const int halve_threshold_root = 15;
 350static const int inflate_threshold_root = 30;
 351
 352static void __alias_free_mem(struct rcu_head *head)
 353{
 354	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 355	kmem_cache_free(fn_alias_kmem, fa);
 356}
 357
 358static inline void alias_free_mem_rcu(struct fib_alias *fa)
 359{
 360	call_rcu(&fa->rcu, __alias_free_mem);
 361}
 362
 363static void __leaf_free_rcu(struct rcu_head *head)
 364{
 365	struct leaf *l = container_of(head, struct leaf, rcu);
 366	kmem_cache_free(trie_leaf_kmem, l);
 367}
 368
 369static inline void free_leaf(struct leaf *l)
 370{
 371	call_rcu_bh(&l->rcu, __leaf_free_rcu);
 372}
 373
 374static inline void free_leaf_info(struct leaf_info *leaf)
 375{
 376	kfree_rcu(leaf, rcu);
 
 377}
 378
 379static struct tnode *tnode_alloc(size_t size)
 
 
 380{
 
 
 
 
 
 
 
 
 
 381	if (size <= PAGE_SIZE)
 382		return kzalloc(size, GFP_KERNEL);
 383	else
 384		return vzalloc(size);
 385}
 386
 387static void __tnode_vfree(struct work_struct *arg)
 388{
 389	struct tnode *tn = container_of(arg, struct tnode, work);
 390	vfree(tn);
 391}
 392
 393static void __tnode_free_rcu(struct rcu_head *head)
 394{
 395	struct tnode *tn = container_of(head, struct tnode, rcu);
 396	size_t size = sizeof(struct tnode) +
 397		      (sizeof(struct rt_trie_node *) << tn->bits);
 398
 399	if (size <= PAGE_SIZE)
 400		kfree(tn);
 401	else {
 402		INIT_WORK(&tn->work, __tnode_vfree);
 403		schedule_work(&tn->work);
 404	}
 405}
 406
 407static inline void tnode_free(struct tnode *tn)
 408{
 409	if (IS_LEAF(tn))
 410		free_leaf((struct leaf *) tn);
 411	else
 412		call_rcu(&tn->rcu, __tnode_free_rcu);
 413}
 414
 415static void tnode_free_safe(struct tnode *tn)
 416{
 417	BUG_ON(IS_LEAF(tn));
 418	tn->tnode_free = tnode_free_head;
 419	tnode_free_head = tn;
 420	tnode_free_size += sizeof(struct tnode) +
 421			   (sizeof(struct rt_trie_node *) << tn->bits);
 422}
 423
 424static void tnode_free_flush(void)
 425{
 426	struct tnode *tn;
 
 427
 428	while ((tn = tnode_free_head)) {
 429		tnode_free_head = tn->tnode_free;
 430		tn->tnode_free = NULL;
 431		tnode_free(tn);
 432	}
 433
 434	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 435		tnode_free_size = 0;
 436		synchronize_rcu();
 437	}
 438}
 
 
 
 
 
 439
 440static struct leaf *leaf_new(void)
 441{
 442	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 443	if (l) {
 444		l->parent = T_LEAF;
 445		INIT_HLIST_HEAD(&l->list);
 446	}
 447	return l;
 448}
 449
 450static struct leaf_info *leaf_info_new(int plen)
 451{
 452	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 453	if (li) {
 454		li->plen = plen;
 455		li->mask_plen = ntohl(inet_make_mask(plen));
 456		INIT_LIST_HEAD(&li->falh);
 457	}
 458	return li;
 459}
 460
 461static struct tnode *tnode_new(t_key key, int pos, int bits)
 462{
 463	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
 464	struct tnode *tn = tnode_alloc(sz);
 465
 466	if (tn) {
 467		tn->parent = T_TNODE;
 468		tn->pos = pos;
 469		tn->bits = bits;
 470		tn->key = key;
 471		tn->full_children = 0;
 472		tn->empty_children = 1<<bits;
 473	}
 474
 475	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
 476		 sizeof(struct rt_trie_node) << bits);
 477	return tn;
 478}
 479
 480/*
 481 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 482 * and no bits are skipped. See discussion in dyntree paper p. 6
 483 */
 484
 485static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
 486{
 487	if (n == NULL || IS_LEAF(n))
 488		return 0;
 
 489
 490	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 491}
 492
 493static inline void put_child(struct trie *t, struct tnode *tn, int i,
 494			     struct rt_trie_node *n)
 
 
 495{
 496	tnode_put_child_reorg(tn, i, n, -1);
 497}
 498
 499 /*
 500  * Add a child at position i overwriting the old value.
 501  * Update the value of full_children and empty_children.
 502  */
 503
 504static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 505				  int wasfull)
 506{
 507	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
 508	int isfull;
 509
 510	BUG_ON(i >= 1<<tn->bits);
 511
 512	/* update emptyChildren */
 513	if (n == NULL && chi != NULL)
 514		tn->empty_children++;
 515	else if (n != NULL && chi == NULL)
 516		tn->empty_children--;
 517
 518	/* update fullChildren */
 519	if (wasfull == -1)
 520		wasfull = tnode_full(tn, chi);
 521
 522	isfull = tnode_full(tn, n);
 
 523	if (wasfull && !isfull)
 524		tn->full_children--;
 525	else if (!wasfull && isfull)
 526		tn->full_children++;
 527
 528	if (n)
 529		node_set_parent(n, tn);
 530
 531	rcu_assign_pointer(tn->child[i], n);
 532}
 533
    /*
     * Starting value of the work budget for the inflate loop and, separately,
     * for the halve loop of a single resize() call.
     */
 534#define MAX_WORK 10
    /*
     * resize - rebalance one tnode and return the node that should replace it.
     *
     * @t:  trie the node belongs to (passed on to inflate()/halve(); also used
     *      for the resize_node_skipped counter when stats are configured)
     * @tn: tnode to rebalance; NULL is accepted and returned as NULL
     *
     * Outcomes visible in the code below:
     *  - no children at all: @tn is queued with tnode_free_safe(), NULL returned
     *  - exactly one child: that child gets a NULL parent, @tn is queued for
     *    freeing and the child is returned ("compress one level")
     *  - otherwise @tn is inflated while the fill test holds, or else halved
     *    while it is too sparse, and the resulting tnode is returned
     *
     * The caller is expected to store the return value in place of @tn.
     */
 535static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 536{
 537	int i;
 538	struct tnode *old_tn;
 539	int inflate_threshold_use;
 540	int halve_threshold_use;
 541	int max_work;
 542
 543	if (!tn)
 544		return NULL;
 545
 546	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 547		 tn, inflate_threshold, halve_threshold);
 548
 549	/* No children */
 550	if (tn->empty_children == tnode_child_length(tn)) {
 551		tnode_free_safe(tn);
 552		return NULL;
 553	}
 554	/* One child */
 555	if (tn->empty_children == tnode_child_length(tn) - 1)
 556		goto one_child;
 557	/*
 558	 * Double as long as the resulting node has a number of
 559	 * nonempty nodes that are above the threshold.
 560	 */
 561
 562	/*
 563	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 564	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
 565	 * Telecommunications, page 6:
 566	 * "A node is doubled if the ratio of non-empty children to all
 567	 * children in the *doubled* node is at least 'high'."
 568	 *
 569	 * 'high' in this instance is the variable 'inflate_threshold'. It
 570	 * is expressed as a percentage, so we multiply it with
 571	 * tnode_child_length() and instead of multiplying by 2 (since the
 572	 * child array will be doubled by inflate()) and multiplying
 573	 * the left-hand side by 100 (to handle the percentage thing) we
 574	 * multiply the left-hand side by 50.
 575	 *
 576	 * The left-hand side may look a bit weird: tnode_child_length(tn)
 577	 * - tn->empty_children is of course the number of non-null children
 578	 * in the current node. tn->full_children is the number of "full"
 579	 * children, that is non-null tnodes with a skip value of 0.
 580	 * All of those will be doubled in the resulting inflated tnode, so
 581	 * we just count them one extra time here.
 582	 *
 583	 * A clearer way to write this would be:
 584	 *
 585	 * to_be_doubled = tn->full_children;
 586	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 587	 *     tn->full_children;
 588	 *
 589	 * new_child_length = tnode_child_length(tn) * 2;
 590	 *
 591	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 592	 *      new_child_length;
 593	 * if (new_fill_factor >= inflate_threshold)
 594	 *
 595	 * ...and so on, tho it would mess up the while () loop.
 596	 *
 597	 * anyway,
 598	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 599	 *      inflate_threshold
 600	 *
 601	 * avoid a division:
 602	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 603	 *      inflate_threshold * new_child_length
 604	 *
 605	 * expand not_to_be_doubled and to_be_doubled, and shorten:
 606	 * 100 * (tnode_child_length(tn) - tn->empty_children +
 607	 *    tn->full_children) >= inflate_threshold * new_child_length
 608	 *
 609	 * expand new_child_length:
 610	 * 100 * (tnode_child_length(tn) - tn->empty_children +
 611	 *    tn->full_children) >=
 612	 *      inflate_threshold * tnode_child_length(tn) * 2
 613	 *
 614	 * shorten again:
 615	 * 50 * (tn->full_children + tnode_child_length(tn) -
 616	 *    tn->empty_children) >= inflate_threshold *
 617	 *    tnode_child_length(tn)
 618	 *
 619	 */
 620
 621	check_tnode(tn);
 622
 623	/* Keep root node larger  */
 624
    	/* A node without a parent is the root and uses the *_root thresholds. */
 625	if (!node_parent((struct rt_trie_node *)tn)) {
 626		inflate_threshold_use = inflate_threshold_root;
 627		halve_threshold_use = halve_threshold_root;
 628	} else {
 629		inflate_threshold_use = inflate_threshold;
 630		halve_threshold_use = halve_threshold;
 631	}
 632
    	/*
    	 * NOTE: max_work is decremented as soon as full_children > 0,
    	 * before the fill test is evaluated, so it can differ from
    	 * MAX_WORK after this loop even when inflate() was never called.
    	 */
 633	max_work = MAX_WORK;
 634	while ((tn->full_children > 0 &&  max_work-- &&
 635		50 * (tn->full_children + tnode_child_length(tn)
 636		      - tn->empty_children)
 637		>= inflate_threshold_use * tnode_child_length(tn))) {
 638
 639		old_tn = tn;
 640		tn = inflate(t, tn);
 641
    		/* inflate() failed: keep the old node and stop inflating */
 642		if (IS_ERR(tn)) {
 643			tn = old_tn;
 644#ifdef CONFIG_IP_FIB_TRIE_STATS
 645			t->stats.resize_node_skipped++;
 646#endif
 647			break;
 648		}
 649	}
 650
 651	check_tnode(tn);
 652
 653	/* Return if at least one inflate is run */
 654	if (max_work != MAX_WORK)
 655		return (struct rt_trie_node *) tn;
 656
 657	/*
 658	 * Halve as long as the number of empty children in this
 659	 * node is above threshold.
 660	 */
 661
 662	max_work = MAX_WORK;
 663	while (tn->bits > 1 &&  max_work-- &&
 664	       100 * (tnode_child_length(tn) - tn->empty_children) <
 665	       halve_threshold_use * tnode_child_length(tn)) {
 666
 667		old_tn = tn;
 668		tn = halve(t, tn);
    		/* halve() failed: keep the old node and stop halving */
 669		if (IS_ERR(tn)) {
 670			tn = old_tn;
 671#ifdef CONFIG_IP_FIB_TRIE_STATS
 672			t->stats.resize_node_skipped++;
 673#endif
 674			break;
 675		}
 676	}
 677
 678
 679	/* Only one child remains */
 680	if (tn->empty_children == tnode_child_length(tn) - 1) {
    		/* also entered directly by the early "One child" goto above */
 681one_child:
 682		for (i = 0; i < tnode_child_length(tn); i++) {
 683			struct rt_trie_node *n;
 684
 685			n = rtnl_dereference(tn->child[i]);
 686			if (!n)
 687				continue;
 688
 689			/* compress one level */
 690
 691			node_set_parent(n, NULL);
 692			tnode_free_safe(tn);
 693			return n;
 694		}
 695	}
 696	return (struct rt_trie_node *) tn;
 697}
 698
 
 
 699
 700static void tnode_clean_free(struct tnode *tn)
 701{
 702	int i;
 703	struct tnode *tofree;
 704
 705	for (i = 0; i < tnode_child_length(tn); i++) {
 706		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
 707		if (tofree)
 708			tnode_free(tofree);
 709	}
 710	tnode_free(tn);
 
 711}
 712
    /*
     * inflate - build a replacement for @tn with twice as many child slots.
     *
     * @t:  trie, passed through to put_child() and resize()
     * @tn: tnode to be replaced
     *
     * A new tnode with the same key and pos but bits + 1 is allocated. Child
     * i of the old node ends up in slot 2*i and/or 2*i+1 of the new one.
     *
     * Returns the new tnode, or ERR_PTR(-ENOMEM). On success the old tnode and
     * every child tnode that was split or dissolved are queued with
     * tnode_free_safe(). On failure the new tnode and the nodes preallocated
     * for it are released with tnode_clean_free() and nothing is queued.
     */
 713static struct tnode *inflate(struct trie *t, struct tnode *tn)
 714{
 715	struct tnode *oldtnode = tn;
 716	int olen = tnode_child_length(tn);
 717	int i;
 718
 719	pr_debug("In inflate\n");
 720
 721	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 722
 723	if (!tn)
 724		return ERR_PTR(-ENOMEM);
 725
 726	/*
 727	 * Preallocate and store tnodes before the actual work so we
 728	 * don't get into an inconsistent state if memory allocation
 729	 * fails. In case of failure we return the oldnode and  inflate
 730	 * of tnode is ignored.
 731	 */
 732
    	/*
    	 * Pass 1: for every child that is a tnode without skipped bits and
    	 * with more than two slots, allocate its 'left' and 'right' halves
    	 * and park them in slots 2*i and 2*i+1 of the new node.
    	 */
 733	for (i = 0; i < olen; i++) {
 734		struct tnode *inode;
 735
 736		inode = (struct tnode *) tnode_get_child(oldtnode, i);
 737		if (inode &&
 738		    IS_TNODE(inode) &&
 739		    inode->pos == oldtnode->pos + oldtnode->bits &&
 740		    inode->bits > 1) {
 741			struct tnode *left, *right;
    			/* single one bit at position inode->pos, counted from the top */
 742			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 743
 744			left = tnode_new(inode->key&(~m), inode->pos + 1,
 745					 inode->bits - 1);
 746			if (!left)
 747				goto nomem;
 748
 749			right = tnode_new(inode->key|m, inode->pos + 1,
 750					  inode->bits - 1);
 751
 752			if (!right) {
 753				tnode_free(left);
 754				goto nomem;
 755			}
 756
 757			put_child(t, tn, 2*i, (struct rt_trie_node *) left);
 758			put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
 759		}
 760	}
 761
    	/* Pass 2: move the children of the old node into the new node. */
 762	for (i = 0; i < olen; i++) {
 763		struct tnode *inode;
 764		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
 765		struct tnode *left, *right;
 766		int size, j;
 767
 768		/* An empty child */
 769		if (node == NULL)
 770			continue;
 771
 772		/* A leaf or an internal node with skipped bits */
 773
    		/*
    		 * Such a node is moved as is; the key bit right after the
    		 * old node's index bits selects slot 2*i or 2*i+1.
    		 */
 774		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 775		   tn->pos + tn->bits - 1) {
 776			if (tkey_extract_bits(node->key,
 777					      oldtnode->pos + oldtnode->bits,
 778					      1) == 0)
 779				put_child(t, tn, 2*i, node);
 780			else
 781				put_child(t, tn, 2*i+1, node);
 782			continue;
 783		}
 784
 785		/* An internal node with two children */
 786		inode = (struct tnode *) node;
 787
    		/* its two children move up one level and inode is dropped */
 788		if (inode->bits == 1) {
 789			put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
 790			put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
 791
 792			tnode_free_safe(inode);
 793			continue;
 794		}
 795
 796		/* An internal node with more than two children */
 797
 798		/* We will replace this node 'inode' with two new
 799		 * ones, 'left' and 'right', each with half of the
 800		 * original children. The two new nodes will have
 801		 * a position one bit further down the key and this
 802		 * means that the "significant" part of their keys
 803		 * (see the discussion near the top of this file)
 804		 * will differ by one bit, which will be "0" in
 805		 * left's key and "1" in right's key. Since we are
 806		 * moving the key position by one step, the bit that
 807		 * we are moving away from - the bit at position
 808		 * (inode->pos) - is the one that will differ between
 809		 * left and right. So... we synthesize that bit in the
 810		 * two  new keys.
 811		 * The mask 'm' below will be a single "one" bit at
 812		 * the position (inode->pos)
 813		 */
 814
 815		/* Use the old key, but set the new significant
 816		 *   bit to zero.
 817		 */
 818
    		/*
    		 * Take the halves preallocated in pass 1 back out of the
    		 * new node; they are stored again below after resize().
    		 */
 819		left = (struct tnode *) tnode_get_child(tn, 2*i);
 820		put_child(t, tn, 2*i, NULL);
 821
 822		BUG_ON(!left);
 823
 824		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 825		put_child(t, tn, 2*i+1, NULL);
 826
 827		BUG_ON(!right);
 828
    		/* first half of inode's children goes to left, second half to right */
 829		size = tnode_child_length(left);
 830		for (j = 0; j < size; j++) {
 831			put_child(t, left, j, rtnl_dereference(inode->child[j]));
 832			put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
 833		}
 834		put_child(t, tn, 2*i, resize(t, left));
 835		put_child(t, tn, 2*i+1, resize(t, right));
 836
 837		tnode_free_safe(inode);
 838	}
 839	tnode_free_safe(oldtnode);
 840	return tn;
 841nomem:
 842	tnode_clean_free(tn);
 843	return ERR_PTR(-ENOMEM);
 844}
 845
    /*
     * halve - build a replacement for @tn with half as many child slots.
     *
     * @t:  trie, passed through to put_child() and resize()
     * @tn: tnode to be replaced
     *
     * A new tnode with the same key and pos but bits - 1 is allocated. The
     * old children are taken in pairs (i, i+1); each pair ends up in slot i/2
     * of the new node.
     *
     * Returns the new tnode, or ERR_PTR(-ENOMEM). On success the old tnode is
     * queued with tnode_free_safe(). On failure the new tnode and the nodes
     * preallocated for it are released with tnode_clean_free().
     */
 846static struct tnode *halve(struct trie *t, struct tnode *tn)
 847{
 848	struct tnode *oldtnode = tn;
 849	struct rt_trie_node *left, *right;
 850	int i;
 851	int olen = tnode_child_length(tn);
 852
 853	pr_debug("In halve\n");
 854
 855	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 856
 857	if (!tn)
 858		return ERR_PTR(-ENOMEM);
 859
 860	/*
 861	 * Preallocate and store tnodes before the actual work so we
 862	 * don't get into an inconsistent state if memory allocation
 863	 * fails. In case of failure we return the oldnode and halve
 864	 * of tnode is ignored.
 865	 */
 866
    	/*
    	 * Pass 1: for every pair with two non-empty children, allocate a
    	 * one-bit tnode that will hold both and park it in slot i/2.
    	 */
 867	for (i = 0; i < olen; i += 2) {
 868		left = tnode_get_child(oldtnode, i);
 869		right = tnode_get_child(oldtnode, i+1);
 870
 871		/* Two nonempty children */
 872		if (left && right) {
 873			struct tnode *newn;
 874
 875			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 876
 877			if (!newn)
 878				goto nomem;
 879
 880			put_child(t, tn, i/2, (struct rt_trie_node *)newn);
 881		}
 882
 883	}
 884
    	/* Pass 2: move the children of the old node into the new node. */
 885	for (i = 0; i < olen; i += 2) {
 886		struct tnode *newBinNode;
 887
 888		left = tnode_get_child(oldtnode, i);
 889		right = tnode_get_child(oldtnode, i+1);
 890
 891		/* At least one of the children is empty */
 892		if (left == NULL) {
 893			if (right == NULL)    /* Both are empty */
 894				continue;
 895			put_child(t, tn, i/2, right);
 896			continue;
 897		}
 898
 899		if (right == NULL) {
 900			put_child(t, tn, i/2, left);
 901			continue;
 902		}
 903
 904		/* Two nonempty children */
    		/*
    		 * Take the node preallocated in pass 1 out of its slot, hang
    		 * both children below it and store the resized result back.
    		 */
 905		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 906		put_child(t, tn, i/2, NULL);
 907		put_child(t, newBinNode, 0, left);
 908		put_child(t, newBinNode, 1, right);
 909		put_child(t, tn, i/2, resize(t, newBinNode));
 910	}
 911	tnode_free_safe(oldtnode);
 912	return tn;
 913nomem:
 914	tnode_clean_free(tn);
 915	return ERR_PTR(-ENOMEM);
 916}
 917
 918/* readside must use rcu_read_lock currently dump routines
 919 via get_fa_head and dump */
 920
 921static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 922{
 923	struct hlist_head *head = &l->list;
 924	struct hlist_node *node;
 925	struct leaf_info *li;
 926
 927	hlist_for_each_entry_rcu(li, node, head, hlist)
 928		if (li->plen == plen)
 929			return li;
 930
 931	return NULL;
 
 
 932}
 933
 934static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 935{
 936	struct leaf_info *li = find_leaf_info(l, plen);
 937
 938	if (!li)
 939		return NULL;
 
 
 
 940
 941	return &li->falh;
 
 942}
 943
 944static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 
 945{
 946	struct leaf_info *li = NULL, *last = NULL;
 947	struct hlist_node *node;
 
 
 
 
 948
 949	if (hlist_empty(head)) {
 950		hlist_add_head_rcu(&new->hlist, head);
 951	} else {
 952		hlist_for_each_entry(li, node, head, hlist) {
 953			if (new->plen > li->plen)
 954				break;
 
 
 955
 956			last = li;
 
 
 
 
 
 
 
 
 
 957		}
 958		if (last)
 959			hlist_add_after_rcu(&last->hlist, &new->hlist);
 960		else
 961			hlist_add_before_rcu(&new->hlist, &li->hlist);
 962	}
 963}
 964
 965/* rcu_read_lock needs to be hold by caller from readside */
 
 966
 967static struct leaf *
 968fib_find_node(struct trie *t, u32 key)
 969{
 970	int pos;
 971	struct tnode *tn;
 972	struct rt_trie_node *n;
 973
 974	pos = 0;
 975	n = rcu_dereference_rtnl(t->trie);
 976
 977	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 978		tn = (struct tnode *) n;
 979
 980		check_tnode(tn);
 981
 982		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 983			pos = tn->pos + tn->bits;
 984			n = tnode_get_child_rcu(tn,
 985						tkey_extract_bits(key,
 986								  tn->pos,
 987								  tn->bits));
 988		} else
 989			break;
 
 
 
 
 990	}
 991	/* Case we have found a leaf. Compare prefixes */
 992
 993	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 994		return (struct leaf *)n;
 
 995
 996	return NULL;
 
 997}
 998
 999static void trie_rebalance(struct trie *t, struct tnode *tn)
1000{
1001	int wasfull;
1002	t_key cindex, key;
1003	struct tnode *tp;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1004
1005	key = tn->key;
 
 
 
 
 
1006
1007	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1008		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1009		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1010		tn = (struct tnode *) resize(t, (struct tnode *)tn);
1011
1012		tnode_put_child_reorg((struct tnode *)tp, cindex,
1013				      (struct rt_trie_node *)tn, wasfull);
1014
1015		tp = node_parent((struct rt_trie_node *) tn);
1016		if (!tp)
1017			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1018
1019		tnode_free_flush();
1020		if (!tp)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1021			break;
1022		tn = tp;
1023	}
1024
1025	/* Handle last (top) tnode */
1026	if (IS_TNODE(tn))
1027		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1028
1029	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1030	tnode_free_flush();
1031}
1032
/*
 * Insert a leaf_info for @key/@plen into the trie.
 * Only used from updater-side.
 *
 * @t:    trie to modify
 * @key:  key of the prefix to insert
 * @plen: prefix length, handed to leaf_info_new()
 *
 * Returns the alias list head (&li->falh) of the newly allocated
 * leaf_info, or NULL if allocating the leaf_info, the leaf or the new
 * tnode fails.  When a leaf with the same key already exists the new
 * leaf_info is attached to it and no rebalancing is done.
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct rt_trie_node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = rtnl_dereference(t->trie);

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in it, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		l = (struct leaf *) n;
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		/* Existing leaf reused: trie shape is unchanged. */
		goto done;
	}
	l = leaf_new();

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		free_leaf(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct rt_trie_node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			/* Split at the first bit where key and n->key differ. */
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			free_leaf(l);
			return NULL;
		}

		node_set_parent((struct rt_trie_node *)tn, tp);

		/* The new leaf goes in the slot selected by its own bit at
		 * newpos; the old node n (possibly NULL) takes the other one.
		 */
		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(t, tn, missbit, (struct rt_trie_node *)l);
		put_child(t, tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, (struct tnode *)tp, cindex,
				  (struct rt_trie_node *)tn);
		} else {
			/* No parent: the new tnode becomes the root. */
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warning("fib_trie"
			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			   tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}
1182
/*
 * Add the route described by @cfg to table @tb.
 * Caller must hold RTNL.
 *
 * Returns 0 on success, otherwise a negative errno:
 *   -EINVAL   prefix length > 32, or key has bits set outside the mask
 *   PTR_ERR() of fib_create_info() when that fails
 *   -EEXIST   an alias with the same tos/priority exists and NLM_F_EXCL
 *             is set, or an exact duplicate was found
 *   -ENOENT   a new alias is needed but NLM_F_CREATE is not set
 *   -ENOBUFS  alias allocation failed
 *   -ENOMEM   fib_insert_node() failed
 * On every failure after fib_create_info() succeeded, the fib_info
 * reference is dropped with fib_release_info().
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct list_head *fa_head = NULL;
	struct fib_info *fi;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	mask = ntohl(inet_make_mask(plen));

	/* Reject destinations with host bits set beyond the prefix. */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;

	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	l = fib_find_node(t, key);
	fa = NULL;

	if (l) {
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert it at the tail of the alias list.
	 *
	 * If fa_head is NULL, no alias list exists for this prefix yet
	 * and one is created through fib_insert_node() below.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		/* Step back one entry so the _continue walk starts at fa. */
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			fa = fa_first;
			if (fa_match) {
				/* Replacing the first alias with an identical
				 * one is a no-op success; a duplicate further
				 * down the list keeps err == -EEXIST.
				 */
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (new_fa == NULL)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;

			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

	if (!fa_head) {
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
			goto out_free_new_fa;
		}
	}

	/* A zero-length prefix is counted in tb_num_default. */
	if (!plen)
		tb->tb_num_default++;

	/* list_add_tail_rcu() links new_fa right before the given entry:
	 * before fa when one was chosen, else at the tail of the list.
	 */
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));

	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
succeeded:
	return 0;

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}
1352
1353/* should be called with rcu_read_lock */
1354static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1355		      t_key key,  const struct flowi4 *flp,
1356		      struct fib_result *res, int fib_flags)
1357{
1358	struct leaf_info *li;
1359	struct hlist_head *hhead = &l->list;
1360	struct hlist_node *node;
1361
1362	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1363		struct fib_alias *fa;
1364
1365		if (l->key != (key & li->mask_plen))
1366			continue;
1367
1368		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1369			struct fib_info *fi = fa->fa_info;
1370			int nhsel, err;
1371
1372			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1373				continue;
1374			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1375				continue;
1376			fib_alias_accessed(fa);
1377			err = fib_props[fa->fa_type].error;
1378			if (err) {
1379#ifdef CONFIG_IP_FIB_TRIE_STATS
1380				t->stats.semantic_match_passed++;
1381#endif
1382				return err;
1383			}
1384			if (fi->fib_flags & RTNH_F_DEAD)
1385				continue;
1386			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1387				const struct fib_nh *nh = &fi->fib_nh[nhsel];
1388
1389				if (nh->nh_flags & RTNH_F_DEAD)
1390					continue;
1391				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1392					continue;
 
1393
1394#ifdef CONFIG_IP_FIB_TRIE_STATS
1395				t->stats.semantic_match_passed++;
1396#endif
1397				res->prefixlen = li->plen;
1398				res->nh_sel = nhsel;
1399				res->type = fa->fa_type;
1400				res->scope = fa->fa_info->fib_scope;
1401				res->fi = fi;
1402				res->table = tb;
1403				res->fa_head = &li->falh;
1404				if (!(fib_flags & FIB_LOOKUP_NOREF))
1405					atomic_inc(&fi->fib_clntref);
1406				return 0;
1407			}
1408		}
1409
1410#ifdef CONFIG_IP_FIB_TRIE_STATS
1411		t->stats.semantic_match_miss++;
1412#endif
1413	}
1414
1415	return 1;
1416}
1417
 
/*
 * Longest-prefix-match lookup of flp->daddr in table @tb.
 *
 * The trie is descended using the key bits; on a dead end the code
 * backtracks by clearing set bits of the child index one at a time
 * (shortening the prefix) and, once a tnode is exhausted, by walking up
 * to its parent.  The whole walk runs under rcu_read_lock().
 *
 * Returns the check_leaf() result for the first leaf that does not
 * return a positive value (0 with @res filled in, or the fib_props[]
 * error taken from the matching alias), or 1 when no prefix matched.
 */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int ret;
	struct rt_trie_node *n;
	struct tnode *pn;
	unsigned int pos, bits;
	t_key key = ntohl(flp->daddr);
	unsigned int chopped_off;
	t_key cindex = 0;
	unsigned int current_prefix_length = KEYLENGTH;
	struct tnode *cn;
	t_key pref_mismatch;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
		goto found;
	}

	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		/* While backtracking (chopped_off != 0) cindex has already
		 * been adjusted and must not be recomputed from the key.
		 */
		if (!chopped_off)
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);

		n = tnode_get_child_rcu(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
			if (ret > 0)
				goto backtrace;
			goto found;
		}

		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (see previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped off
		 * *are* zero.
		 */

		/* NOTA BENE: Checking only skipped bits
		   for the new node here */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
		 */

		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);

		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state directly.
		 */
		if (pref_mismatch) {
			/* mp: position of the first mismatching bit */
			int mp = KEYLENGTH - fls(pref_mismatch);

			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}

		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;

		/*
		 * Either we do the actual chop off according or if we have
		 * chopped off all bits in this tnode walk up to our parent.
		 */

		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
			if (!parent)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	return ret;
}
 
1609
1610/*
1611 * Remove the leaf and return parent.
1612 */
1613static void trie_leaf_remove(struct trie *t, struct leaf *l)
1614{
1615	struct tnode *tp = node_parent((struct rt_trie_node *) l);
 
 
1616
1617	pr_debug("entering trie_leaf_remove(%p)\n", l);
 
1618
1619	if (tp) {
1620		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1621		put_child(t, (struct tnode *)tp, cindex, NULL);
 
 
 
 
 
1622		trie_rebalance(t, tp);
1623	} else
1624		rcu_assign_pointer(t->trie, NULL);
1625
1626	free_leaf(l);
 
 
 
 
 
 
1627}
1628
1629/*
1630 * Caller must hold RTNL.
1631 */
1632int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1633{
1634	struct trie *t = (struct trie *) tb->tb_data;
1635	u32 key, mask;
1636	int plen = cfg->fc_dst_len;
1637	u8 tos = cfg->fc_tos;
1638	struct fib_alias *fa, *fa_to_delete;
1639	struct list_head *fa_head;
1640	struct leaf *l;
1641	struct leaf_info *li;
1642
1643	if (plen > 32)
1644		return -EINVAL;
1645
1646	key = ntohl(cfg->fc_dst);
1647	mask = ntohl(inet_make_mask(plen));
1648
1649	if (key & ~mask)
1650		return -EINVAL;
1651
1652	key = key & mask;
1653	l = fib_find_node(t, key);
1654
1655	if (!l)
1656		return -ESRCH;
1657
1658	fa_head = get_fa_head(l, plen);
1659	fa = fib_find_alias(fa_head, tos, 0);
1660
1661	if (!fa)
1662		return -ESRCH;
1663
1664	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
 
1665
1666	fa_to_delete = NULL;
1667	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1668	list_for_each_entry_continue(fa, fa_head, fa_list) {
1669		struct fib_info *fi = fa->fa_info;
1670
1671		if (fa->fa_tos != tos)
 
 
1672			break;
1673
1674		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1675		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1676		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1677		    (!cfg->fc_prefsrc ||
1678		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1679		    (!cfg->fc_protocol ||
1680		     fi->fib_protocol == cfg->fc_protocol) &&
1681		    fib_nh_match(cfg, fi) == 0) {
 
1682			fa_to_delete = fa;
1683			break;
1684		}
1685	}
1686
1687	if (!fa_to_delete)
1688		return -ESRCH;
1689
1690	fa = fa_to_delete;
1691	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1692		  &cfg->fc_nlinfo, 0);
1693
1694	l = fib_find_node(t, key);
1695	li = find_leaf_info(l, plen);
1696
1697	list_del_rcu(&fa->fa_list);
1698
1699	if (!plen)
1700		tb->tb_num_default--;
1701
1702	if (list_empty(fa_head)) {
1703		hlist_del_rcu(&li->hlist);
1704		free_leaf_info(li);
1705	}
1706
1707	if (hlist_empty(&l->list))
1708		trie_leaf_remove(t, l);
1709
1710	if (fa->fa_state & FA_S_ACCESSED)
1711		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1712
1713	fib_release_info(fa->fa_info);
1714	alias_free_mem_rcu(fa);
1715	return 0;
1716}
1717
1718static int trie_flush_list(struct list_head *head)
 
1719{
1720	struct fib_alias *fa, *fa_node;
1721	int found = 0;
1722
1723	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1724		struct fib_info *fi = fa->fa_info;
 
 
 
1725
1726		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1727			list_del_rcu(&fa->fa_list);
1728			fib_release_info(fa->fa_info);
1729			alias_free_mem_rcu(fa);
1730			found++;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1731		}
 
 
 
 
 
 
 
 
 
 
 
 
 
1732	}
1733	return found;
 
 
 
 
 
 
1734}
1735
1736static int trie_flush_leaf(struct leaf *l)
1737{
1738	int found = 0;
1739	struct hlist_head *lih = &l->list;
1740	struct hlist_node *node, *tmp;
1741	struct leaf_info *li = NULL;
1742
1743	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1744		found += trie_flush_list(&li->falh);
1745
1746		if (list_empty(&li->falh)) {
1747			hlist_del_rcu(&li->hlist);
1748			free_leaf_info(li);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1749		}
 
 
 
 
 
 
 
 
1750	}
1751	return found;
 
 
 
 
1752}
1753
1754/*
1755 * Scan for the next right leaf starting at node p->child[idx]
1756 * Since we have back pointer, no recursion necessary.
1757 */
1758static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1759{
1760	do {
1761		t_key idx;
 
 
 
 
1762
1763		if (c)
1764			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1765		else
1766			idx = 0;
 
 
 
 
 
 
 
 
 
 
1767
1768		while (idx < 1u << p->bits) {
1769			c = tnode_get_child_rcu(p, idx++);
1770			if (!c)
1771				continue;
1772
1773			if (IS_LEAF(c)) {
1774				prefetch(rcu_dereference_rtnl(p->child[idx]));
1775				return (struct leaf *) c;
1776			}
 
 
1777
1778			/* Rescan start scanning in new node */
1779			p = (struct tnode *) c;
1780			idx = 0;
 
 
 
 
 
 
1781		}
1782
1783		/* Node empty, walk back up to parent */
1784		c = (struct rt_trie_node *) p;
1785	} while ((p = node_parent_rcu(c)) != NULL);
 
 
1786
1787	return NULL; /* Root of trie */
 
 
 
 
1788}
1789
1790static struct leaf *trie_firstleaf(struct trie *t)
 
1791{
1792	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
 
 
 
 
1793
1794	if (!n)
1795		return NULL;
 
 
1796
1797	if (IS_LEAF(n))          /* trie is just a leaf */
1798		return (struct leaf *) n;
1799
1800	return leaf_walk_rcu(n, NULL);
1801}
 
1802
1803static struct leaf *trie_nextleaf(struct leaf *l)
1804{
1805	struct rt_trie_node *c = (struct rt_trie_node *) l;
1806	struct tnode *p = node_parent_rcu(c);
 
 
 
1807
1808	if (!p)
1809		return NULL;	/* trie with just one leaf */
1810
1811	return leaf_walk_rcu(p, c);
1812}
 
 
1813
/*
 * Return the leaf reached after @index steps from the first leaf of
 * @t, or NULL when the trie holds fewer leaves than that.
 */
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l;

	for (l = trie_firstleaf(t); l && index > 0; index--)
		l = trie_nextleaf(l);

	return l;
}
 
 
 
 
 
 
 
1823
 
 
 
1824
1825/*
1826 * Caller must hold RTNL.
1827 */
1828int fib_table_flush(struct fib_table *tb)
 
 
 
 
 
 
 
 
1829{
1830	struct trie *t = (struct trie *) tb->tb_data;
1831	struct leaf *l, *ll = NULL;
 
 
 
1832	int found = 0;
1833
1834	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1835		found += trie_flush_leaf(l);
 
 
1836
1837		if (ll && hlist_empty(&ll->list))
1838			trie_leaf_remove(t, ll);
1839		ll = l;
1840	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1841
1842	if (ll && hlist_empty(&ll->list))
1843		trie_leaf_remove(t, ll);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1844
1845	pr_debug("trie_flush found=%d\n", found);
1846	return found;
1847}
1848
/*
 * Free the memory of table @tb with kfree().
 *
 * NOTE(review): no trie nodes are released here; presumably callers
 * flush the table first -- confirm against the call sites.
 */
void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}
1853
1854static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1855			   struct fib_table *tb,
1856			   struct sk_buff *skb, struct netlink_callback *cb)
1857{
1858	int i, s_i;
1859	struct fib_alias *fa;
1860	__be32 xkey = htonl(key);
 
1861
1862	s_i = cb->args[5];
1863	i = 0;
1864
1865	/* rcu_read_lock is hold by caller */
 
1866
1867	list_for_each_entry_rcu(fa, fah, fa_list) {
1868		if (i < s_i) {
1869			i++;
 
1870			continue;
1871		}
1872
1873		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1874				  cb->nlh->nlmsg_seq,
1875				  RTM_NEWROUTE,
1876				  tb->tb_id,
1877				  fa->fa_type,
1878				  xkey,
1879				  plen,
1880				  fa->fa_tos,
1881				  fa->fa_info, NLM_F_MULTI) < 0) {
1882			cb->args[5] = i;
1883			return -1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1884		}
1885		i++;
1886	}
1887	cb->args[5] = i;
1888	return skb->len;
1889}
1890
1891static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1892			struct sk_buff *skb, struct netlink_callback *cb)
1893{
1894	struct leaf_info *li;
1895	struct hlist_node *node;
1896	int i, s_i;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1897
1898	s_i = cb->args[4];
 
1899	i = 0;
1900
1901	/* rcu_read_lock is hold by caller */
1902	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1903		if (i < s_i) {
1904			i++;
1905			continue;
1906		}
1907
1908		if (i > s_i)
1909			cb->args[5] = 0;
1910
1911		if (list_empty(&li->falh))
1912			continue;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1913
1914		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1915			cb->args[4] = i;
1916			return -1;
 
 
1917		}
 
 
1918		i++;
1919	}
1920
1921	cb->args[4] = i;
1922	return skb->len;
 
 
 
 
 
1923}
1924
 
1925int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1926		   struct netlink_callback *cb)
1927{
1928	struct leaf *l;
1929	struct trie *t = (struct trie *) tb->tb_data;
1930	t_key key = cb->args[2];
1931	int count = cb->args[3];
1932
1933	rcu_read_lock();
1934	/* Dump starting at last key.
1935	 * Note: 0.0.0.0/0 (ie default) is first key.
1936	 */
1937	if (count == 0)
1938		l = trie_firstleaf(t);
1939	else {
1940		/* Normally, continue from last key, but if that is missing
1941		 * fallback to using slow rescan
1942		 */
1943		l = fib_find_node(t, key);
1944		if (!l)
1945			l = trie_leafindex(t, count);
1946	}
 
1947
1948	while (l) {
1949		cb->args[2] = l->key;
1950		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1951			cb->args[3] = count;
1952			rcu_read_unlock();
1953			return -1;
1954		}
1955
1956		++count;
1957		l = trie_nextleaf(l);
 
1958		memset(&cb->args[4], 0,
1959		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
 
 
 
 
1960	}
1961	cb->args[3] = count;
1962	rcu_read_unlock();
 
1963
1964	return skb->len;
1965}
1966
/*
 * Create the two slab caches used by this file: one for struct
 * fib_alias and one shared by leaves and leaf_infos, sized for the
 * larger of the two structures.  Both are created with SLAB_PANIC and
 * the returned pointers are not checked here.
 */
void __init fib_trie_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
}
1978
1979
1980struct fib_table *fib_trie_table(u32 id)
1981{
1982	struct fib_table *tb;
1983	struct trie *t;
 
 
 
 
1984
1985	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1986		     GFP_KERNEL);
1987	if (tb == NULL)
1988		return NULL;
1989
1990	tb->tb_id = id;
1991	tb->tb_default = -1;
1992	tb->tb_num_default = 0;
 
 
 
 
1993
1994	t = (struct trie *) tb->tb_data;
1995	memset(t, 0, sizeof(*t));
 
 
 
 
 
 
 
 
1996
1997	return tb;
1998}
1999
2000#ifdef CONFIG_PROC_FS
/*
 * Depth first Trie walk iterator.
 * State kept between calls of fib_trie_get_first()/fib_trie_get_next().
 */
struct fib_trie_iter {
	struct seq_net_private p;	/* presumably per-net seq_file state -- confirm */
	struct fib_table *tb;		/* table the returned node belongs to */
	struct tnode *tnode;		/* tnode currently scanned; NULL if the trie is a single leaf */
	unsigned int index;		/* next child slot of tnode to look at */
	unsigned int depth;		/* adjusted on every descend/ascend of the walk */
};
2009
2010static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2011{
2012	struct tnode *tn = iter->tnode;
2013	unsigned int cindex = iter->index;
2014	struct tnode *p;
2015
2016	/* A single entry routing table */
2017	if (!tn)
2018		return NULL;
2019
2020	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2021		 iter->tnode, iter->index, iter->depth);
2022rescan:
2023	while (cindex < (1<<tn->bits)) {
2024		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2025
2026		if (n) {
 
 
 
 
 
 
2027			if (IS_LEAF(n)) {
2028				iter->tnode = tn;
2029				iter->index = cindex + 1;
2030			} else {
2031				/* push down one level */
2032				iter->tnode = (struct tnode *) n;
2033				iter->index = 0;
2034				++iter->depth;
2035			}
 
2036			return n;
2037		}
2038
2039		++cindex;
2040	}
2041
2042	/* Current node exhausted, pop back up */
2043	p = node_parent_rcu((struct rt_trie_node *)tn);
2044	if (p) {
2045		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2046		tn = p;
2047		--iter->depth;
2048		goto rescan;
2049	}
2050
2051	/* got root? */
 
 
 
2052	return NULL;
2053}
2054
2055static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2056				       struct trie *t)
2057{
2058	struct rt_trie_node *n;
2059
2060	if (!t)
2061		return NULL;
2062
2063	n = rcu_dereference(t->trie);
 
2064	if (!n)
2065		return NULL;
2066
2067	if (IS_TNODE(n)) {
2068		iter->tnode = (struct tnode *) n;
2069		iter->index = 0;
2070		iter->depth = 1;
2071	} else {
2072		iter->tnode = NULL;
2073		iter->index = 0;
2074		iter->depth = 0;
2075	}
2076
2077	return n;
2078}
2079
2080static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2081{
2082	struct rt_trie_node *n;
2083	struct fib_trie_iter iter;
2084
2085	memset(s, 0, sizeof(*s));
2086
2087	rcu_read_lock();
2088	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2089		if (IS_LEAF(n)) {
2090			struct leaf *l = (struct leaf *)n;
2091			struct leaf_info *li;
2092			struct hlist_node *tmp;
2093
2094			s->leaves++;
2095			s->totdepth += iter.depth;
2096			if (iter.depth > s->maxdepth)
2097				s->maxdepth = iter.depth;
2098
2099			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2100				++s->prefixes;
2101		} else {
2102			const struct tnode *tn = (const struct tnode *) n;
2103			int i;
2104
2105			s->tnodes++;
2106			if (tn->bits < MAX_STAT_DEPTH)
2107				s->nodesizes[tn->bits]++;
2108
2109			for (i = 0; i < (1<<tn->bits); i++)
2110				if (!tn->child[i])
2111					s->nullpointers++;
2112		}
2113	}
2114	rcu_read_unlock();
2115}
2116
2117/*
2118 *	This outputs /proc/net/fib_triestats
2119 */
2120static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2121{
2122	unsigned int i, max, pointers, bytes, avdepth;
2123
2124	if (stat->leaves)
2125		avdepth = stat->totdepth*100 / stat->leaves;
2126	else
2127		avdepth = 0;
2128
2129	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2130		   avdepth / 100, avdepth % 100);
2131	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2132
2133	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2134	bytes = sizeof(struct leaf) * stat->leaves;
2135
2136	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2137	bytes += sizeof(struct leaf_info) * stat->prefixes;
2138
2139	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2140	bytes += sizeof(struct tnode) * stat->tnodes;
2141
2142	max = MAX_STAT_DEPTH;
2143	while (max > 0 && stat->nodesizes[max-1] == 0)
2144		max--;
2145
2146	pointers = 0;
2147	for (i = 1; i <= max; i++)
2148		if (stat->nodesizes[i] != 0) {
2149			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2150			pointers += (1<<i) * stat->nodesizes[i];
2151		}
2152	seq_putc(seq, '\n');
2153	seq_printf(seq, "\tPointers: %u\n", pointers);
2154
2155	bytes += sizeof(struct rt_trie_node *) * pointers;
2156	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2157	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2158}
2159
#ifdef CONFIG_IP_FIB_TRIE_STATS
/*
 * Print the lookup usage counters kept in @stats to @seq.
 */
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_puts(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n", stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", stats->resize_node_skipped);
}
#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2176
2177static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2178{
2179	if (tb->tb_id == RT_TABLE_LOCAL)
2180		seq_puts(seq, "Local:\n");
2181	else if (tb->tb_id == RT_TABLE_MAIN)
2182		seq_puts(seq, "Main:\n");
2183	else
2184		seq_printf(seq, "Id %d:\n", tb->tb_id);
2185}
2186
2187
2188static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2189{
2190	struct net *net = (struct net *)seq->private;
2191	unsigned int h;
2192
2193	seq_printf(seq,
2194		   "Basic info: size of leaf:"
2195		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2196		   sizeof(struct leaf), sizeof(struct tnode));
2197
 
2198	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2199		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2200		struct hlist_node *node;
2201		struct fib_table *tb;
2202
2203		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2204			struct trie *t = (struct trie *) tb->tb_data;
2205			struct trie_stat stat;
2206
2207			if (!t)
2208				continue;
2209
2210			fib_table_print(seq, tb);
2211
2212			trie_collect_stats(t, &stat);
2213			trie_show_stats(seq, &stat);
2214#ifdef CONFIG_IP_FIB_TRIE_STATS
2215			trie_show_usage(seq, &t->stats);
2216#endif
2217		}
 
2218	}
 
2219
2220	return 0;
2221}
2222
2223static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2224{
2225	return single_open_net(inode, file, fib_triestat_seq_show);
2226}
2227
2228static const struct file_operations fib_triestat_fops = {
2229	.owner	= THIS_MODULE,
2230	.open	= fib_triestat_seq_open,
2231	.read	= seq_read,
2232	.llseek	= seq_lseek,
2233	.release = single_release_net,
2234};
2235
2236static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2237{
2238	struct fib_trie_iter *iter = seq->private;
2239	struct net *net = seq_file_net(seq);
2240	loff_t idx = 0;
2241	unsigned int h;
2242
2243	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2244		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2245		struct hlist_node *node;
2246		struct fib_table *tb;
2247
2248		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2249			struct rt_trie_node *n;
2250
2251			for (n = fib_trie_get_first(iter,
2252						    (struct trie *) tb->tb_data);
2253			     n; n = fib_trie_get_next(iter))
2254				if (pos == idx++) {
2255					iter->tb = tb;
2256					return n;
2257				}
2258		}
2259	}
2260
2261	return NULL;
2262}
2263
2264static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2265	__acquires(RCU)
2266{
2267	rcu_read_lock();
2268	return fib_trie_get_idx(seq, *pos);
2269}
2270
2271static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2272{
2273	struct fib_trie_iter *iter = seq->private;
2274	struct net *net = seq_file_net(seq);
2275	struct fib_table *tb = iter->tb;
2276	struct hlist_node *tb_node;
2277	unsigned int h;
2278	struct rt_trie_node *n;
2279
2280	++*pos;
2281	/* next node in same table */
2282	n = fib_trie_get_next(iter);
2283	if (n)
2284		return n;
2285
2286	/* walk rest of this hash chain */
2287	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2288	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2289		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2290		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2291		if (n)
2292			goto found;
2293	}
2294
2295	/* new hash chain */
2296	while (++h < FIB_TABLE_HASHSZ) {
2297		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2298		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2299			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2300			if (n)
2301				goto found;
2302		}
2303	}
2304	return NULL;
2305
2306found:
2307	iter->tb = tb;
2308	return n;
2309}
2310
2311static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2312	__releases(RCU)
2313{
2314	rcu_read_unlock();
2315}
2316
/* Write @n indentation steps of three spaces each; nothing if @n <= 0. */
static void seq_indent(struct seq_file *seq, int n)
{
	int level;

	for (level = 0; level < n; level++)
		seq_puts(seq, "   ");
}
2322
2323static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2324{
2325	switch (s) {
2326	case RT_SCOPE_UNIVERSE: return "universe";
2327	case RT_SCOPE_SITE:	return "site";
2328	case RT_SCOPE_LINK:	return "link";
2329	case RT_SCOPE_HOST:	return "host";
2330	case RT_SCOPE_NOWHERE:	return "nowhere";
2331	default:
2332		snprintf(buf, len, "scope=%d", s);
2333		return buf;
2334	}
2335}
2336
2337static const char *const rtn_type_names[__RTN_MAX] = {
2338	[RTN_UNSPEC] = "UNSPEC",
2339	[RTN_UNICAST] = "UNICAST",
2340	[RTN_LOCAL] = "LOCAL",
2341	[RTN_BROADCAST] = "BROADCAST",
2342	[RTN_ANYCAST] = "ANYCAST",
2343	[RTN_MULTICAST] = "MULTICAST",
2344	[RTN_BLACKHOLE] = "BLACKHOLE",
2345	[RTN_UNREACHABLE] = "UNREACHABLE",
2346	[RTN_PROHIBIT] = "PROHIBIT",
2347	[RTN_THROW] = "THROW",
2348	[RTN_NAT] = "NAT",
2349	[RTN_XRESOLVE] = "XRESOLVE",
2350};
2351
2352static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2353{
2354	if (t < __RTN_MAX && rtn_type_names[t])
2355		return rtn_type_names[t];
2356	snprintf(buf, len, "type %u", t);
2357	return buf;
2358}
2359
2360/* Pretty print the trie */
2361static int fib_trie_seq_show(struct seq_file *seq, void *v)
2362{
2363	const struct fib_trie_iter *iter = seq->private;
2364	struct rt_trie_node *n = v;
2365
2366	if (!node_parent_rcu(n))
2367		fib_table_print(seq, iter->tb);
2368
2369	if (IS_TNODE(n)) {
2370		struct tnode *tn = (struct tnode *) n;
2371		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2372
2373		seq_indent(seq, iter->depth-1);
2374		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2375			   &prf, tn->pos, tn->bits, tn->full_children,
2376			   tn->empty_children);
2377
2378	} else {
2379		struct leaf *l = (struct leaf *) n;
2380		struct leaf_info *li;
2381		struct hlist_node *node;
2382		__be32 val = htonl(l->key);
2383
2384		seq_indent(seq, iter->depth);
2385		seq_printf(seq, "  |-- %pI4\n", &val);
2386
2387		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2388			struct fib_alias *fa;
2389
2390			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2391				char buf1[32], buf2[32];
2392
2393				seq_indent(seq, iter->depth+1);
2394				seq_printf(seq, "  /%d %s %s", li->plen,
2395					   rtn_scope(buf1, sizeof(buf1),
2396						     fa->fa_info->fib_scope),
2397					   rtn_type(buf2, sizeof(buf2),
2398						    fa->fa_type));
2399				if (fa->fa_tos)
2400					seq_printf(seq, " tos=%d", fa->fa_tos);
2401				seq_putc(seq, '\n');
2402			}
 
2403		}
2404	}
2405
2406	return 0;
2407}
2408
2409static const struct seq_operations fib_trie_seq_ops = {
2410	.start  = fib_trie_seq_start,
2411	.next   = fib_trie_seq_next,
2412	.stop   = fib_trie_seq_stop,
2413	.show   = fib_trie_seq_show,
2414};
2415
2416static int fib_trie_seq_open(struct inode *inode, struct file *file)
2417{
2418	return seq_open_net(inode, file, &fib_trie_seq_ops,
2419			    sizeof(struct fib_trie_iter));
2420}
2421
2422static const struct file_operations fib_trie_fops = {
2423	.owner  = THIS_MODULE,
2424	.open   = fib_trie_seq_open,
2425	.read   = seq_read,
2426	.llseek = seq_lseek,
2427	.release = seq_release_net,
2428};
2429
2430struct fib_route_iter {
2431	struct seq_net_private p;
2432	struct trie *main_trie;
 
2433	loff_t	pos;
2434	t_key	key;
2435};
2436
2437static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
 
2438{
2439	struct leaf *l = NULL;
2440	struct trie *t = iter->main_trie;
2441
2442	/* use cache location of last found key */
2443	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2444		pos -= iter->pos;
2445	else {
2446		iter->pos = 0;
2447		l = trie_firstleaf(t);
2448	}
2449
2450	while (l && pos-- > 0) {
 
 
 
2451		iter->pos++;
2452		l = trie_nextleaf(l);
 
 
 
 
2453	}
2454
2455	if (l)
2456		iter->key = pos;	/* remember it */
2457	else
2458		iter->pos = 0;		/* forget it */
2459
2460	return l;
2461}
2462
2463static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2464	__acquires(RCU)
2465{
2466	struct fib_route_iter *iter = seq->private;
2467	struct fib_table *tb;
 
2468
2469	rcu_read_lock();
 
2470	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2471	if (!tb)
2472		return NULL;
2473
2474	iter->main_trie = (struct trie *) tb->tb_data;
2475	if (*pos == 0)
2476		return SEQ_START_TOKEN;
2477	else
2478		return fib_route_get_idx(iter, *pos - 1);
 
 
 
 
 
 
2479}
2480
2481static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2482{
2483	struct fib_route_iter *iter = seq->private;
2484	struct leaf *l = v;
 
2485
2486	++*pos;
2487	if (v == SEQ_START_TOKEN) {
2488		iter->pos = 0;
2489		l = trie_firstleaf(iter->main_trie);
2490	} else {
2491		iter->pos++;
2492		l = trie_nextleaf(l);
2493	}
2494
2495	if (l)
 
 
 
 
2496		iter->key = l->key;
2497	else
 
2498		iter->pos = 0;
 
 
2499	return l;
2500}
2501
2502static void fib_route_seq_stop(struct seq_file *seq, void *v)
2503	__releases(RCU)
2504{
2505	rcu_read_unlock();
2506}
2507
2508static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2509{
2510	unsigned int flags = 0;
2511
2512	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2513		flags = RTF_REJECT;
2514	if (fi && fi->fib_nh->nh_gw)
2515		flags |= RTF_GATEWAY;
 
 
 
 
2516	if (mask == htonl(0xFFFFFFFF))
2517		flags |= RTF_HOST;
2518	flags |= RTF_UP;
2519	return flags;
2520}
2521
2522/*
2523 *	This outputs /proc/net/route.
2524 *	The format of the file is not supposed to be changed
2525 *	and needs to be same as fib_hash output to avoid breaking
2526 *	legacy utilities
2527 */
2528static int fib_route_seq_show(struct seq_file *seq, void *v)
2529{
2530	struct leaf *l = v;
2531	struct leaf_info *li;
2532	struct hlist_node *node;
 
 
2533
2534	if (v == SEQ_START_TOKEN) {
2535		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2536			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2537			   "\tWindow\tIRTT");
2538		return 0;
2539	}
2540
2541	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2542		struct fib_alias *fa;
2543		__be32 mask, prefix;
2544
2545		mask = inet_make_mask(li->plen);
2546		prefix = htonl(l->key);
 
 
2547
2548		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2549			const struct fib_info *fi = fa->fa_info;
2550			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2551			int len;
2552
2553			if (fa->fa_type == RTN_BROADCAST
2554			    || fa->fa_type == RTN_MULTICAST)
2555				continue;
2556
2557			if (fi)
2558				seq_printf(seq,
2559					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2560					 "%d\t%08X\t%d\t%u\t%u%n",
2561					 fi->fib_dev ? fi->fib_dev->name : "*",
2562					 prefix,
2563					 fi->fib_nh->nh_gw, flags, 0, 0,
2564					 fi->fib_priority,
2565					 mask,
2566					 (fi->fib_advmss ?
2567					  fi->fib_advmss + 40 : 0),
2568					 fi->fib_window,
2569					 fi->fib_rtt >> 3, &len);
2570			else
2571				seq_printf(seq,
2572					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2573					 "%d\t%08X\t%d\t%u\t%u%n",
2574					 prefix, 0, flags, 0, 0, 0,
2575					 mask, 0, 0, 0, &len);
2576
2577			seq_printf(seq, "%*s\n", 127 - len, "");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2578		}
 
2579	}
2580
2581	return 0;
2582}
2583
2584static const struct seq_operations fib_route_seq_ops = {
2585	.start  = fib_route_seq_start,
2586	.next   = fib_route_seq_next,
2587	.stop   = fib_route_seq_stop,
2588	.show   = fib_route_seq_show,
2589};
2590
2591static int fib_route_seq_open(struct inode *inode, struct file *file)
2592{
2593	return seq_open_net(inode, file, &fib_route_seq_ops,
2594			    sizeof(struct fib_route_iter));
2595}
2596
2597static const struct file_operations fib_route_fops = {
2598	.owner  = THIS_MODULE,
2599	.open   = fib_route_seq_open,
2600	.read   = seq_read,
2601	.llseek = seq_lseek,
2602	.release = seq_release_net,
2603};
2604
2605int __net_init fib_proc_init(struct net *net)
2606{
2607	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
 
2608		goto out1;
2609
2610	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2611				  &fib_triestat_fops))
2612		goto out2;
2613
2614	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
 
2615		goto out3;
2616
2617	return 0;
2618
2619out3:
2620	proc_net_remove(net, "fib_triestat");
2621out2:
2622	proc_net_remove(net, "fib_trie");
2623out1:
2624	return -ENOMEM;
2625}
2626
2627void __net_exit fib_proc_exit(struct net *net)
2628{
2629	proc_net_remove(net, "fib_trie");
2630	proc_net_remove(net, "fib_triestat");
2631	proc_net_remove(net, "route");
2632}
2633
2634#endif /* CONFIG_PROC_FS */
v6.2
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
 
 
 
 
   3 *
   4 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   5 *     & Swedish University of Agricultural Sciences.
   6 *
   7 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
   8 *     Agricultural Sciences.
   9 *
  10 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  11 *
  12 * This work is based on the LPC-trie which is originally described in:
  13 *
  14 * An experimental study of compression methods for dynamic tries
  15 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  16 * https://www.csc.kth.se/~snilsson/software/dyntrie2/
 
  17 *
  18 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  19 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  20 *
 
  21 * Code from fib_hash has been reused which includes the following header:
  22 *
 
  23 * INET		An implementation of the TCP/IP protocol suite for the LINUX
  24 *		operating system.  INET is implemented using the  BSD Socket
  25 *		interface as the means of communication with the user level.
  26 *
  27 *		IPv4 FIB: lookup engine and maintenance routines.
  28 *
 
  29 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  30 *
 
 
 
 
 
 * Substantial contributions to this work come from:
  32 *
  33 *		David S. Miller, <davem@davemloft.net>
  34 *		Stephen Hemminger <shemminger@osdl.org>
  35 *		Paul E. McKenney <paulmck@us.ibm.com>
  36 *		Patrick McHardy <kaber@trash.net>
  37 */
  38#include <linux/cache.h>
  39#include <linux/uaccess.h>
 
 
 
  40#include <linux/bitops.h>
  41#include <linux/types.h>
  42#include <linux/kernel.h>
  43#include <linux/mm.h>
  44#include <linux/string.h>
  45#include <linux/socket.h>
  46#include <linux/sockios.h>
  47#include <linux/errno.h>
  48#include <linux/in.h>
  49#include <linux/inet.h>
  50#include <linux/inetdevice.h>
  51#include <linux/netdevice.h>
  52#include <linux/if_arp.h>
  53#include <linux/proc_fs.h>
  54#include <linux/rcupdate.h>
  55#include <linux/skbuff.h>
  56#include <linux/netlink.h>
  57#include <linux/init.h>
  58#include <linux/list.h>
  59#include <linux/slab.h>
  60#include <linux/export.h>
  61#include <linux/vmalloc.h>
  62#include <linux/notifier.h>
  63#include <net/net_namespace.h>
  64#include <net/inet_dscp.h>
  65#include <net/ip.h>
  66#include <net/protocol.h>
  67#include <net/route.h>
  68#include <net/tcp.h>
  69#include <net/sock.h>
  70#include <net/ip_fib.h>
  71#include <net/fib_notifier.h>
  72#include <trace/events/fib.h>
  73#include "fib_lookup.h"
  74
  75static int call_fib_entry_notifier(struct notifier_block *nb,
  76				   enum fib_event_type event_type, u32 dst,
  77				   int dst_len, struct fib_alias *fa,
  78				   struct netlink_ext_ack *extack)
  79{
  80	struct fib_entry_notifier_info info = {
  81		.info.extack = extack,
  82		.dst = dst,
  83		.dst_len = dst_len,
  84		.fi = fa->fa_info,
  85		.dscp = fa->fa_dscp,
  86		.type = fa->fa_type,
  87		.tb_id = fa->tb_id,
  88	};
  89	return call_fib4_notifier(nb, event_type, &info.info);
  90}
  91
  92static int call_fib_entry_notifiers(struct net *net,
  93				    enum fib_event_type event_type, u32 dst,
  94				    int dst_len, struct fib_alias *fa,
  95				    struct netlink_ext_ack *extack)
  96{
  97	struct fib_entry_notifier_info info = {
  98		.info.extack = extack,
  99		.dst = dst,
 100		.dst_len = dst_len,
 101		.fi = fa->fa_info,
 102		.dscp = fa->fa_dscp,
 103		.type = fa->fa_type,
 104		.tb_id = fa->tb_id,
 105	};
 106	return call_fib4_notifiers(net, event_type, &info.info);
 107}
 108
 109#define MAX_STAT_DEPTH 32
 
 110
 111#define KEYLENGTH	(8*sizeof(t_key))
 112#define KEY_MAX		((t_key)~0)
 
 
 113
 114typedef unsigned int t_key;
 
 
 
 
 
 115
 116#define IS_TRIE(n)	((n)->pos >= KEYLENGTH)
 117#define IS_TNODE(n)	((n)->bits)
 118#define IS_LEAF(n)	(!(n)->bits)
 
 
 
 
 119
 120struct key_vector {
 
 121	t_key key;
 122	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
 123	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 124	unsigned char slen;
 
 125	union {
 126		/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
 127		struct hlist_head leaf;
 128		/* This array is valid if (pos | bits) > 0 (TNODE) */
 129		DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode);
 130	};
 
 131};
 132
 133struct tnode {
 134	struct rcu_head rcu;
 135	t_key empty_children;		/* KEYLENGTH bits needed */
 136	t_key full_children;		/* KEYLENGTH bits needed */
 137	struct key_vector __rcu *parent;
 138	struct key_vector kv[1];
 139#define tn_bits kv[0].bits
 140};
 141
 142#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
 143#define LEAF_SIZE	TNODE_SIZE(1)
 144
 145#ifdef CONFIG_IP_FIB_TRIE_STATS
 146struct trie_use_stats {
 147	unsigned int gets;
 148	unsigned int backtrack;
 149	unsigned int semantic_match_passed;
 150	unsigned int semantic_match_miss;
 151	unsigned int null_node_hit;
 152	unsigned int resize_node_skipped;
 153};
 154#endif
 155
 156struct trie_stat {
 157	unsigned int totdepth;
 158	unsigned int maxdepth;
 159	unsigned int tnodes;
 160	unsigned int leaves;
 161	unsigned int nullpointers;
 162	unsigned int prefixes;
 163	unsigned int nodesizes[MAX_STAT_DEPTH];
 164};
 165
 166struct trie {
 167	struct key_vector kv[1];
 168#ifdef CONFIG_IP_FIB_TRIE_STATS
 169	struct trie_use_stats __percpu *stats;
 170#endif
 171};
 172
 173static struct key_vector *resize(struct trie *t, struct key_vector *tn);
 174static unsigned int tnode_free_size;
 
 
 
 
 
 
 
 175
 176/*
 177 * synchronize_rcu after call_rcu for outstanding dirty memory; it should be
 178 * especially useful before resizing the root node with PREEMPT_NONE configs;
 179 * the value was obtained experimentally, aiming to avoid visible slowdown.
 180 */
 181unsigned int sysctl_fib_sync_mem = 512 * 1024;
 182unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
 183unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
 184
 185static struct kmem_cache *fn_alias_kmem __ro_after_init;
 186static struct kmem_cache *trie_leaf_kmem __ro_after_init;
 187
 188static inline struct tnode *tn_info(struct key_vector *kv)
 
 
 
 189{
 190	return container_of(kv, struct tnode, kv[0]);
 
 
 
 
 191}
 192
 193/* caller must hold RTNL */
 194#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
 195#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
 
 
 
 
 
 
 196
 197/* caller must hold RCU read lock or RTNL */
 198#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
 199#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
 200
 201/* wrapper for rcu_assign_pointer */
 202static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
 
 
 203{
 204	if (n)
 205		rcu_assign_pointer(tn_info(n)->parent, tp);
 206}
 207
 208#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
 
 
 
 
 
 
 
 
 209
 210/* This provides us with the number of children in this node, in the case of a
 211 * leaf this will return 0 meaning none of the children are accessible.
 212 */
 213static inline unsigned long child_length(const struct key_vector *tn)
 214{
 215	return (1ul << tn->bits) & ~(1ul);
 
 
 216}
 217
 218#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
 
 
 
 219
 220static inline unsigned long get_index(t_key key, struct key_vector *kv)
 221{
 222	unsigned long index = key ^ kv->key;
 
 223
 224	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
 
 
 
 
 225		return 0;
 
 226
 227	return index >> kv->pos;
 
 
 228}
 229
 230/* To understand this stuff, an understanding of keys and all their bits is
 231 * necessary. Every node in the trie has a key associated with it, but not
 232 * all of the bits in that key are significant.
 233 *
 234 * Consider a node 'n' and its parent 'tp'.
 235 *
 236 * If n is a leaf, every bit in its key is significant. Its presence is
 237 * necessitated by path compression, since during a tree traversal (when
 238 * searching for a leaf - unless we are doing an insertion) we will completely
 239 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
 240 * a potentially successful search, that we have indeed been walking the
 241 * correct key path.
 242 *
 243 * Note that we can never "miss" the correct key in the tree if present by
 244 * following the wrong path. Path compression ensures that segments of the key
 245 * that are the same for all keys with a given prefix are skipped, but the
 246 * skipped part *is* identical for each node in the subtrie below the skipped
 247 * bit! trie_insert() in this implementation takes care of that.
 248 *
 249 * if n is an internal node - a 'tnode' here, the various parts of its key
 250 * have many different meanings.
 251 *
 252 * Example:
 253 * _________________________________________________________________
 254 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 255 * -----------------------------------------------------------------
 256 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 257 *
 258 * _________________________________________________________________
 259 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 260 * -----------------------------------------------------------------
 261 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 262 *
 263 * tp->pos = 22
 264 * tp->bits = 3
 265 * n->pos = 13
 266 * n->bits = 4
 267 *
 268 * First, let's just ignore the bits that come before the parent tp, that is
 269 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 270 * point we do not use them for anything.
 271 *
 272 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 273 * index into the parent's child array. That is, they will be used to find
 274 * 'n' among tp's children.
 275 *
 276 * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
 277 * for the node n.
 278 *
 279 * All the bits we have seen so far are significant to the node n. The rest
 280 * of the bits are really not needed or indeed known in n->key.
 281 *
 282 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 283 * n's child array, and will of course be different for each child.
 284 *
 285 * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
 286 * at this point.
 287 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 288
 289static const int halve_threshold = 25;
 290static const int inflate_threshold = 50;
 291static const int halve_threshold_root = 15;
 292static const int inflate_threshold_root = 30;
 293
 294static void __alias_free_mem(struct rcu_head *head)
 295{
 296	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 297	kmem_cache_free(fn_alias_kmem, fa);
 298}
 299
 300static inline void alias_free_mem_rcu(struct fib_alias *fa)
 301{
 302	call_rcu(&fa->rcu, __alias_free_mem);
 303}
 304
 305#define TNODE_VMALLOC_MAX \
 306	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
 
 
 
 307
 308static void __node_free_rcu(struct rcu_head *head)
 309{
 310	struct tnode *n = container_of(head, struct tnode, rcu);
 
 311
 312	if (!n->tn_bits)
 313		kmem_cache_free(trie_leaf_kmem, n);
 314	else
 315		kvfree(n);
 316}
 317
 318#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
 319
 320static struct tnode *tnode_alloc(int bits)
 321{
 322	size_t size;
 323
 324	/* verify bits is within bounds */
 325	if (bits > TNODE_VMALLOC_MAX)
 326		return NULL;
 327
 328	/* determine size and verify it is non-zero and didn't overflow */
 329	size = TNODE_SIZE(1ul << bits);
 330
 331	if (size <= PAGE_SIZE)
 332		return kzalloc(size, GFP_KERNEL);
 333	else
 334		return vzalloc(size);
 335}
 336
 337static inline void empty_child_inc(struct key_vector *n)
 
 
 
 
 
 
 338{
 339	tn_info(n)->empty_children++;
 
 
 340
 341	if (!tn_info(n)->empty_children)
 342		tn_info(n)->full_children++;
 
 
 
 
 343}
 344
 345static inline void empty_child_dec(struct key_vector *n)
 346{
 347	if (!tn_info(n)->empty_children)
 348		tn_info(n)->full_children--;
 
 
 
 349
 350	tn_info(n)->empty_children--;
 
 
 
 
 
 
 351}
 352
 353static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
 354{
 355	struct key_vector *l;
 356	struct tnode *kv;
 357
 358	kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 359	if (!kv)
 360		return NULL;
 
 
 361
 362	/* initialize key vector */
 363	l = kv->kv;
 364	l->key = key;
 365	l->pos = 0;
 366	l->bits = 0;
 367	l->slen = fa->fa_slen;
 368
 369	/* link leaf to fib alias */
 370	INIT_HLIST_HEAD(&l->leaf);
 371	hlist_add_head(&fa->fa_list, &l->leaf);
 372
 
 
 
 
 
 
 
 373	return l;
 374}
 375
 376static struct key_vector *tnode_new(t_key key, int pos, int bits)
 377{
 378	unsigned int shift = pos + bits;
 379	struct key_vector *tn;
 380	struct tnode *tnode;
 
 
 
 
 
 381
 382	/* verify bits and pos their msb bits clear and values are valid */
 383	BUG_ON(!bits || (shift > KEYLENGTH));
 
 
 384
 385	tnode = tnode_alloc(bits);
 386	if (!tnode)
 387		return NULL;
 
 
 
 
 
 388
 389	pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
 390		 sizeof(struct key_vector *) << bits);
 
 
 391
 392	if (bits == KEYLENGTH)
 393		tnode->full_children = 1;
 394	else
 395		tnode->empty_children = 1ul << bits;
 396
 397	tn = tnode->kv;
 398	tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
 399	tn->pos = pos;
 400	tn->bits = bits;
 401	tn->slen = pos;
 402
 403	return tn;
 404}
 405
 406/* Check whether a tnode 'n' is "full", i.e. it is an internal node
 407 * and no bits are skipped. See discussion in dyntree paper p. 6
 408 */
 409static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
 410{
 411	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
 412}
 413
 414/* Add a child at position i overwriting the old value.
 415 * Update the value of full_children and empty_children.
 416 */
 417static void put_child(struct key_vector *tn, unsigned long i,
 418		      struct key_vector *n)
 
 
 419{
 420	struct key_vector *chi = get_child(tn, i);
 421	int isfull, wasfull;
 422
 423	BUG_ON(i >= child_length(tn));
 424
 425	/* update emptyChildren, overflow into fullChildren */
 426	if (!n && chi)
 427		empty_child_inc(tn);
 428	if (n && !chi)
 429		empty_child_dec(tn);
 430
 431	/* update fullChildren */
 432	wasfull = tnode_full(tn, chi);
 
 
 433	isfull = tnode_full(tn, n);
 434
 435	if (wasfull && !isfull)
 436		tn_info(tn)->full_children--;
 437	else if (!wasfull && isfull)
 438		tn_info(tn)->full_children++;
 439
 440	if (n && (tn->slen < n->slen))
 441		tn->slen = n->slen;
 442
 443	rcu_assign_pointer(tn->tnode[i], n);
 444}
 445
 446static void update_children(struct key_vector *tn)
 
 447{
 448	unsigned long i;
 
 
 
 
 449
 450	/* update all of the child parent pointers */
 451	for (i = child_length(tn); i;) {
 452		struct key_vector *inode = get_child(tn, --i);
 453
 454		if (!inode)
 455			continue;
 456
 457		/* Either update the children of a tnode that
 458		 * already belongs to us or update the child
 459		 * to point to ourselves.
 460		 */
 461		if (node_parent(inode) == tn)
 462			update_children(inode);
 463		else
 464			node_set_parent(inode, tn);
 465	}
 466}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 467
 468static inline void put_child_root(struct key_vector *tp, t_key key,
 469				  struct key_vector *n)
 470{
 471	if (IS_TRIE(tp))
 472		rcu_assign_pointer(tp->tnode[0], n);
 473	else
 474		put_child(tp, get_index(key, tp), n);
 475}
 476
 477static inline void tnode_free_init(struct key_vector *tn)
 478{
 479	tn_info(tn)->rcu.next = NULL;
 480}
 481
 482static inline void tnode_free_append(struct key_vector *tn,
 483				     struct key_vector *n)
 484{
 485	tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
 486	tn_info(tn)->rcu.next = &tn_info(n)->rcu;
 487}
 
 488
 489static void tnode_free(struct key_vector *tn)
 490{
 491	struct callback_head *head = &tn_info(tn)->rcu;
 
 
 492
 493	while (head) {
 494		head = head->next;
 495		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
 496		node_free(tn);
 497
 498		tn = container_of(head, struct tnode, rcu)->kv;
 
 
 
 
 
 
 499	}
 500
 501	if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) {
 502		tnode_free_size = 0;
 503		synchronize_rcu();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 504	}
 505}
 506
 507static struct key_vector *replace(struct trie *t,
 508				  struct key_vector *oldtnode,
 509				  struct key_vector *tn)
 510{
 511	struct key_vector *tp = node_parent(oldtnode);
 512	unsigned long i;
 513
 514	/* setup the parent pointer out of and back into this node */
 515	NODE_INIT_PARENT(tn, tp);
 516	put_child_root(tp, tn->key, tn);
 
 
 
 
 
 
 
 
 517
 518	/* update all of the child parent pointers */
 519	update_children(tn);
 
 
 
 
 
 520
 521	/* all pointers should be clean so we are done */
 522	tnode_free(oldtnode);
 523
 524	/* resize children now that oldtnode is freed */
 525	for (i = child_length(tn); i;) {
 526		struct key_vector *inode = get_child(tn, --i);
 
 527
 528		/* resize child node */
 529		if (tnode_full(tn, inode))
 530			tn = resize(t, inode);
 
 531	}
 532
 533	return tp;
 534}
 535
 536static struct key_vector *inflate(struct trie *t,
 537				  struct key_vector *oldtnode)
 538{
 539	struct key_vector *tn;
 540	unsigned long i;
 541	t_key m;
 542
 543	pr_debug("In inflate\n");
 544
 545	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
 
 546	if (!tn)
 547		goto notnode;
 
 
 
 
 
 
 
 
 
 
 548
 549	/* prepare oldtnode to be freed */
 550	tnode_free_init(oldtnode);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 551
 552	/* Assemble all of the pointers in our cluster, in this case that
 553	 * represents all of the pointers out of our allocated nodes that
 554	 * point to existing tnodes and the links between our allocated
 555	 * nodes.
 556	 */
 557	for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
 558		struct key_vector *inode = get_child(oldtnode, --i);
 559		struct key_vector *node0, *node1;
 560		unsigned long j, k;
 
 561
 562		/* An empty child */
 563		if (!inode)
 564			continue;
 565
 566		/* A leaf or an internal node with skipped bits */
 567		if (!tnode_full(oldtnode, inode)) {
 568			put_child(tn, get_index(inode->key, tn), inode);
 
 
 
 
 
 
 
 569			continue;
 570		}
 571
 572		/* drop the node in the old tnode free list */
 573		tnode_free_append(oldtnode, inode);
 574
 575		/* An internal node with two children */
 576		if (inode->bits == 1) {
 577			put_child(tn, 2 * i + 1, get_child(inode, 1));
 578			put_child(tn, 2 * i, get_child(inode, 0));
 
 
 579			continue;
 580		}
 581
 
 
 582		/* We will replace this node 'inode' with two new
 583		 * ones, 'node0' and 'node1', each with half of the
 584		 * original children. The two new nodes will have
 585		 * a position one bit further down the key and this
 586		 * means that the "significant" part of their keys
 587		 * (see the discussion near the top of this file)
 588		 * will differ by one bit, which will be "0" in
 589		 * node0's key and "1" in node1's key. Since we are
 590		 * moving the key position by one step, the bit that
 591		 * we are moving away from - the bit at position
 592		 * (tn->pos) - is the one that will differ between
 593		 * node0 and node1. So... we synthesize that bit in the
 594		 * two new keys.
 
 
 595		 */
 596		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
 597		if (!node1)
 598			goto nomem;
 599		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
 600
 601		tnode_free_append(tn, node1);
 602		if (!node0)
 603			goto nomem;
 604		tnode_free_append(tn, node0);
 605
 606		/* populate child pointers in new nodes */
 607		for (k = child_length(inode), j = k / 2; j;) {
 608			put_child(node1, --j, get_child(inode, --k));
 609			put_child(node0, j, get_child(inode, j));
 610			put_child(node1, --j, get_child(inode, --k));
 611			put_child(node0, j, get_child(inode, j));
 612		}
 613
 614		/* link new nodes to parent */
 615		NODE_INIT_PARENT(node1, tn);
 616		NODE_INIT_PARENT(node0, tn);
 617
 618		/* link parent to nodes */
 619		put_child(tn, 2 * i + 1, node1);
 620		put_child(tn, 2 * i, node0);
 621	}
 622
 623	/* setup the parent pointers into and out of this node */
 624	return replace(t, oldtnode, tn);
 625nomem:
 626	/* all pointers should be clean so we are done */
 627	tnode_free(tn);
 628notnode:
 629	return NULL;
 630}
 631
 632static struct key_vector *halve(struct trie *t,
 633				struct key_vector *oldtnode)
 634{
 635	struct key_vector *tn;
 636	unsigned long i;
 637
 638	pr_debug("In halve\n");
 639
 640	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
 641	if (!tn)
 642		goto notnode;
 643
 644	/* prepare oldtnode to be freed */
 645	tnode_free_init(oldtnode);
 646
 647	/* Assemble all of the pointers in our cluster, in this case that
 648	 * represents all of the pointers out of our allocated nodes that
 649	 * point to existing tnodes and the links between our allocated
 650	 * nodes.
 651	 */
 652	for (i = child_length(oldtnode); i;) {
 653		struct key_vector *node1 = get_child(oldtnode, --i);
 654		struct key_vector *node0 = get_child(oldtnode, --i);
 655		struct key_vector *inode;
 656
 657		/* At least one of the children is empty */
 658		if (!node1 || !node0) {
 659			put_child(tn, i / 2, node1 ? : node0);
 660			continue;
 661		}
 
 
 662
 663		/* Two nonempty children */
 664		inode = tnode_new(node0->key, oldtnode->pos, 1);
 665		if (!inode)
 666			goto nomem;
 667		tnode_free_append(tn, inode);
 668
 669		/* initialize pointers out of node */
 670		put_child(inode, 1, node1);
 671		put_child(inode, 0, node0);
 672		NODE_INIT_PARENT(inode, tn);
 673
 674		/* link parent to node */
 675		put_child(tn, i / 2, inode);
 676	}
 677
 678	/* setup the parent pointers into and out of this node */
 679	return replace(t, oldtnode, tn);
 680nomem:
 681	/* all pointers should be clean so we are done */
 682	tnode_free(tn);
 683notnode:
 684	return NULL;
 685}
 686
 687static struct key_vector *collapse(struct trie *t,
 688				   struct key_vector *oldtnode)
 689{
 690	struct key_vector *n, *tp;
 691	unsigned long i;
 
 
 692
 693	/* scan the tnode looking for that one child that might still exist */
 694	for (n = NULL, i = child_length(oldtnode); !n && i;)
 695		n = get_child(oldtnode, --i);
 696
 697	/* compress one level */
 698	tp = node_parent(oldtnode);
 699	put_child_root(tp, oldtnode->key, n);
 700	node_set_parent(n, tp);
 701
 702	/* drop dead node */
 703	node_free(oldtnode);
 704
 705	return tp;
 706}
 
 
 
 
 707
 708static unsigned char update_suffix(struct key_vector *tn)
 709{
 710	unsigned char slen = tn->pos;
 711	unsigned long stride, i;
 712	unsigned char slen_max;
 713
 714	/* only vector 0 can have a suffix length greater than or equal to
 715	 * tn->pos + tn->bits, the second highest node will have a suffix
 716	 * length at most of tn->pos + tn->bits - 1
 717	 */
 718	slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
 719
 720	/* search though the list of children looking for nodes that might
 721	 * have a suffix greater than the one we currently have.  This is
 722	 * why we start with a stride of 2 since a stride of 1 would
 723	 * represent the nodes with suffix length equal to tn->pos
 724	 */
 725	for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
 726		struct key_vector *n = get_child(tn, i);
 727
 728		if (!n || (n->slen <= slen))
 729			continue;
 730
 731		/* update stride and slen based on new value */
 732		stride <<= (n->slen - slen);
 733		slen = n->slen;
 734		i &= ~(stride - 1);
 735
 736		/* stop searching if we have hit the maximum possible value */
 737		if (slen >= slen_max)
 738			break;
 739	}
 740
 741	tn->slen = slen;
 
 742
 743	return slen;
 744}
 745
 746/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 747 * the Helsinki University of Technology and Matti Tikkanen of Nokia
 748 * Telecommunications, page 6:
 749 * "A node is doubled if the ratio of non-empty children to all
 750 * children in the *doubled* node is at least 'high'."
 751 *
 752 * 'high' in this instance is the variable 'inflate_threshold'. It
 753 * is expressed as a percentage, so we multiply it with
 754 * child_length() and instead of multiplying by 2 (since the
 755 * child array will be doubled by inflate()) and multiplying
 756 * the left-hand side by 100 (to handle the percentage thing) we
 757 * multiply the left-hand side by 50.
 758 *
 759 * The left-hand side may look a bit weird: child_length(tn)
 760 * - tn->empty_children is of course the number of non-null children
 761 * in the current node. tn->full_children is the number of "full"
 762 * children, that is non-null tnodes with a skip value of 0.
 763 * All of those will be doubled in the resulting inflated tnode, so
 764 * we just count them one extra time here.
 765 *
 766 * A clearer way to write this would be:
 767 *
 768 * to_be_doubled = tn->full_children;
 769 * not_to_be_doubled = child_length(tn) - tn->empty_children -
 770 *     tn->full_children;
 771 *
 772 * new_child_length = child_length(tn) * 2;
 773 *
 774 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 775 *      new_child_length;
 776 * if (new_fill_factor >= inflate_threshold)
 777 *
 778 * ...and so on, though it would mess up the while () loop.
 779 *
 780 * anyway,
 781 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 782 *      inflate_threshold
 783 *
 784 * avoid a division:
 785 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 786 *      inflate_threshold * new_child_length
 787 *
 788 * expand not_to_be_doubled and to_be_doubled, and shorten:
 789 * 100 * (child_length(tn) - tn->empty_children +
 790 *    tn->full_children) >= inflate_threshold * new_child_length
 791 *
 792 * expand new_child_length:
 793 * 100 * (child_length(tn) - tn->empty_children +
 794 *    tn->full_children) >=
 795 *      inflate_threshold * child_length(tn) * 2
 796 *
 797 * shorten again:
 798 * 50 * (tn->full_children + child_length(tn) -
 799 *    tn->empty_children) >= inflate_threshold *
 800 *    child_length(tn)
 801 *
 802 */
 803static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
 804{
 805	unsigned long used = child_length(tn);
 806	unsigned long threshold = used;
 807
 808	/* Keep root node larger */
 809	threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
 810	used -= tn_info(tn)->empty_children;
 811	used += tn_info(tn)->full_children;
 812
 813	/* if bits == KEYLENGTH then pos = 0, and will fail below */
 
 
 
 
 
 
 
 
 
 
 
 
 814
 815	return (used > 1) && tn->pos && ((50 * used) >= threshold);
 816}
 817
 818static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
 819{
 820	unsigned long used = child_length(tn);
 821	unsigned long threshold = used;
 
 822
 823	/* Keep root node larger */
 824	threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
 825	used -= tn_info(tn)->empty_children;
 826
 827	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
 828
 829	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
 830}
 831
 832static inline bool should_collapse(struct key_vector *tn)
 833{
 834	unsigned long used = child_length(tn);
 835
 836	used -= tn_info(tn)->empty_children;
 837
 838	/* account for bits == KEYLENGTH case */
 839	if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
 840		used -= KEY_MAX;
 841
 842	/* One child or none, time to drop us from the trie */
 843	return used < 2;
 844}
 845
 846#define MAX_WORK 10
 847static struct key_vector *resize(struct trie *t, struct key_vector *tn)
 848{
 849#ifdef CONFIG_IP_FIB_TRIE_STATS
 850	struct trie_use_stats __percpu *stats = t->stats;
 851#endif
 852	struct key_vector *tp = node_parent(tn);
 853	unsigned long cindex = get_index(tn->key, tp);
 854	int max_work = MAX_WORK;
 855
 856	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 857		 tn, inflate_threshold, halve_threshold);
 858
 859	/* track the tnode via the pointer from the parent instead of
 860	 * doing it ourselves.  This way we can let RCU fully do its
 861	 * thing without us interfering
 862	 */
 863	BUG_ON(tn != get_child(tp, cindex));
 864
 865	/* Double as long as the resulting node has a number of
 866	 * nonempty nodes that are above the threshold.
 867	 */
 868	while (should_inflate(tp, tn) && max_work) {
 869		tp = inflate(t, tn);
 870		if (!tp) {
 871#ifdef CONFIG_IP_FIB_TRIE_STATS
 872			this_cpu_inc(stats->resize_node_skipped);
 873#endif
 874			break;
 875		}
 876
 877		max_work--;
 878		tn = get_child(tp, cindex);
 
 879	}
 
 880
 881	/* update parent in case inflate failed */
 882	tp = node_parent(tn);
 883
 884	/* Return if at least one inflate is run */
 885	if (max_work != MAX_WORK)
 886		return tp;
 887
 888	/* Halve as long as the number of empty children in this
 889	 * node is above threshold.
 890	 */
 891	while (should_halve(tp, tn) && max_work) {
 892		tp = halve(t, tn);
 893		if (!tp) {
 894#ifdef CONFIG_IP_FIB_TRIE_STATS
 895			this_cpu_inc(stats->resize_node_skipped);
 896#endif
 
 
 
 
 
 
 
 
 
 897			break;
 898		}
 899
 900		max_work--;
 901		tn = get_child(tp, cindex);
 902	}
 
 903
 904	/* Only one child remains */
 905	if (should_collapse(tn))
 906		return collapse(t, tn);
 907
 908	/* update parent in case halve failed */
 909	return node_parent(tn);
 910}
 911
 912static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
 913{
 914	unsigned char node_slen = tn->slen;
 915
 916	while ((node_slen > tn->pos) && (node_slen > slen)) {
 917		slen = update_suffix(tn);
 918		if (node_slen == slen)
 919			break;
 920
 921		tn = node_parent(tn);
 922		node_slen = tn->slen;
 923	}
 924}
 925
 926static void node_push_suffix(struct key_vector *tn, unsigned char slen)
 927{
 928	while (tn->slen < slen) {
 929		tn->slen = slen;
 930		tn = node_parent(tn);
 931	}
 932}
 933
 934/* rcu_read_lock needs to be hold by caller from readside */
 935static struct key_vector *fib_find_node(struct trie *t,
 936					struct key_vector **tp, u32 key)
 937{
 938	struct key_vector *pn, *n = t->kv;
 939	unsigned long index = 0;
 940
 941	do {
 942		pn = n;
 943		n = get_child_rcu(n, index);
 
 944
 945		if (!n)
 946			break;
 947
 948		index = get_cindex(key, n);
 
 
 949
 950		/* This bit of code is a bit tricky but it combines multiple
 951		 * checks into a single check.  The prefix consists of the
 952		 * prefix plus zeros for the bits in the cindex. The index
 953		 * is the difference between the key and this value.  From
 954		 * this we can actually derive several pieces of data.
 955		 *   if (index >= (1ul << bits))
 956		 *     we have a mismatch in skip bits and failed
 957		 *   else
 958		 *     we know the value is cindex
 959		 *
 960		 * This check is safe even if bits == KEYLENGTH due to the
 961		 * fact that we can only allocate a node with 32 bits if a
 962		 * long is greater than 32 bits.
 963		 */
 964		if (index >= (1ul << n->bits)) {
 965			n = NULL;
 966			break;
 967		}
 
 968
 969		/* keep searching until we find a perfect match leaf or NULL */
 970	} while (IS_TNODE(n));
 
 971
 972	*tp = pn;
 
 
 973
 974	return n;
 975}
 976
 977/* Return the first fib alias matching DSCP with
 978 * priority less than or equal to PRIO.
 979 * If 'find_first' is set, return the first matching
 980 * fib alias, regardless of DSCP and priority.
 981 */
 982static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
 983					dscp_t dscp, u32 prio, u32 tb_id,
 984					bool find_first)
 985{
 986	struct fib_alias *fa;
 
 
 
 
 
 
 
 987
 988	if (!fah)
 989		return NULL;
 990
 991	hlist_for_each_entry(fa, fah, fa_list) {
 992		/* Avoid Sparse warning when using dscp_t in inequalities */
 993		u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp);
 994		u8 __dscp = inet_dscp_to_dsfield(dscp);
 
 
 
 
 
 
 
 
 
 
 
 
 
 995
 996		if (fa->fa_slen < slen)
 997			continue;
 998		if (fa->fa_slen != slen)
 999			break;
1000		if (fa->tb_id > tb_id)
1001			continue;
1002		if (fa->tb_id != tb_id)
1003			break;
1004		if (find_first)
1005			return fa;
1006		if (__fa_dscp > __dscp)
1007			continue;
1008		if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp)
1009			return fa;
1010	}
1011
1012	return NULL;
1013}
1014
1015static struct fib_alias *
1016fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
1017{
1018	u8 slen = KEYLENGTH - fri->dst_len;
1019	struct key_vector *l, *tp;
1020	struct fib_table *tb;
1021	struct fib_alias *fa;
1022	struct trie *t;
1023
1024	tb = fib_get_table(net, fri->tb_id);
1025	if (!tb)
1026		return NULL;
1027
1028	t = (struct trie *)tb->tb_data;
1029	l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
1030	if (!l)
1031		return NULL;
1032
1033	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
1034		if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
1035		    fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi &&
1036		    fa->fa_type == fri->type)
1037			return fa;
1038	}
1039
1040	return NULL;
1041}
1042
1043void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
1044{
1045	u8 fib_notify_on_flag_change;
1046	struct fib_alias *fa_match;
1047	struct sk_buff *skb;
1048	int err;
1049
1050	rcu_read_lock();
1051
1052	fa_match = fib_find_matching_alias(net, fri);
1053	if (!fa_match)
1054		goto out;
1055
1056	/* These are paired with the WRITE_ONCE() happening in this function.
1057	 * The reason is that we are only protected by RCU at this point.
1058	 */
1059	if (READ_ONCE(fa_match->offload) == fri->offload &&
1060	    READ_ONCE(fa_match->trap) == fri->trap &&
1061	    READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
1062		goto out;
1063
1064	WRITE_ONCE(fa_match->offload, fri->offload);
1065	WRITE_ONCE(fa_match->trap, fri->trap);
1066
1067	fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change);
1068
1069	/* 2 means send notifications only if offload_failed was changed. */
1070	if (fib_notify_on_flag_change == 2 &&
1071	    READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
1072		goto out;
1073
1074	WRITE_ONCE(fa_match->offload_failed, fri->offload_failed);
1075
1076	if (!fib_notify_on_flag_change)
1077		goto out;
 
1078
1079	skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);
1080	if (!skb) {
1081		err = -ENOBUFS;
1082		goto errout;
1083	}
1084
1085	err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0);
1086	if (err < 0) {
1087		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
1088		WARN_ON(err == -EMSGSIZE);
1089		kfree_skb(skb);
1090		goto errout;
1091	}
 
1092
1093	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC);
1094	goto out;
1095
1096errout:
1097	rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err);
1098out:
1099	rcu_read_unlock();
1100}
1101EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
1102
/* Apply resize() to @tn and then to each node resize() hands back, until
 * the trie root is reached.
 */
static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
	struct key_vector *cur;

	for (cur = tn; !IS_TRIE(cur); cur = resize(t, cur))
		;
}
1108
1109static int fib_insert_node(struct trie *t, struct key_vector *tp,
1110			   struct fib_alias *new, t_key key)
1111{
1112	struct key_vector *n, *l;
1113
1114	l = leaf_new(key, new);
1115	if (!l)
1116		goto noleaf;
1117
1118	/* retrieve child from parent node */
1119	n = get_child(tp, get_index(key, tp));
1120
1121	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
1122	 *
1123	 *  Add a new tnode here
1124	 *  first tnode need some special handling
1125	 *  leaves us in position for handling as case 3
1126	 */
1127	if (n) {
1128		struct key_vector *tn;
1129
1130		tn = tnode_new(key, __fls(key ^ n->key), 1);
1131		if (!tn)
1132			goto notnode;
1133
1134		/* initialize routes out of node */
1135		NODE_INIT_PARENT(tn, tp);
1136		put_child(tn, get_index(key, tn) ^ 1, n);
1137
1138		/* start adding routes into the node */
1139		put_child_root(tp, key, tn);
1140		node_set_parent(n, tn);
1141
1142		/* parent now has a NULL spot where the leaf can go */
1143		tp = tn;
1144	}
1145
1146	/* Case 3: n is NULL, and will just insert a new leaf */
1147	node_push_suffix(tp, new->fa_slen);
1148	NODE_INIT_PARENT(l, tp);
1149	put_child_root(tp, key, l);
1150	trie_rebalance(t, tp);
1151
1152	return 0;
1153notnode:
1154	node_free(l);
1155noleaf:
1156	return -ENOMEM;
1157}
1158
1159static int fib_insert_alias(struct trie *t, struct key_vector *tp,
1160			    struct key_vector *l, struct fib_alias *new,
1161			    struct fib_alias *fa, t_key key)
1162{
1163	if (!l)
1164		return fib_insert_node(t, tp, new, key);
1165
1166	if (fa) {
1167		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
1168	} else {
1169		struct fib_alias *last;
 
 
 
 
 
 
 
 
 
1170
1171		hlist_for_each_entry(last, &l->leaf, fa_list) {
1172			if (new->fa_slen < last->fa_slen)
1173				break;
1174			if ((new->fa_slen == last->fa_slen) &&
1175			    (new->tb_id > last->tb_id))
1176				break;
1177			fa = last;
1178		}
1179
1180		if (fa)
1181			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
1182		else
1183			hlist_add_head_rcu(&new->fa_list, &l->leaf);
1184	}
1185
1186	/* if we added to the tail node then we need to update slen */
1187	if (l->slen < new->fa_slen) {
1188		l->slen = new->fa_slen;
1189		node_push_suffix(tp, new->fa_slen);
1190	}
1191
1192	return 0;
1193}
 
1194
1195static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
1196{
1197	if (plen > KEYLENGTH) {
1198		NL_SET_ERR_MSG(extack, "Invalid prefix length");
1199		return false;
 
 
 
1200	}
1201
1202	if ((plen < KEYLENGTH) && (key << plen)) {
1203		NL_SET_ERR_MSG(extack,
1204			       "Invalid prefix for given prefix length");
1205		return false;
1206	}
 
1207
1208	return true;
 
 
1209}
1210
/* Forward declaration: fib_table_insert() below calls this to unlink a
 * freshly inserted alias again when the entry notifiers reject it.
 */
1211static void fib_remove_alias(struct trie *t, struct key_vector *tp,
1212			     struct key_vector *l, struct fib_alias *old);
1213
1214/* Caller must hold RTNL.
 *
 * Add the route described by @cfg to table @tb.
 *
 * Outline, as implemented below:
 *  - reject an invalid key/prefix-length pair with -EINVAL;
 *  - build a fib_info via fib_create_info() (its error is returned as is);
 *  - if an alias with the same prefix, DSCP and priority already exists:
 *      NLM_F_EXCL    -> -EEXIST;
 *      NLM_F_REPLACE -> swap the first such alias for a new one in place;
 *      otherwise an exact duplicate (same type and fib_info) -> -EEXIST;
 *  - without NLM_F_CREATE a new alias is refused with -ENOENT;
 *  - otherwise allocate a new alias (-ENOBUFS on failure), link it into
 *    the trie, call the entry notifiers and send RTM_NEWROUTE.
 *
 * Returns 0 on success or a negative errno.  On every failure path after
 * fib_create_info() succeeded, the fib_info is released again.
 */
1215int fib_table_insert(struct net *net, struct fib_table *tb,
1216		     struct fib_config *cfg, struct netlink_ext_ack *extack)
1217{
1218	struct trie *t = (struct trie *)tb->tb_data;
1219	struct fib_alias *fa, *new_fa;
1220	struct key_vector *l, *tp;
1221	u16 nlflags = NLM_F_EXCL;
1222	struct fib_info *fi;
1223	u8 plen = cfg->fc_dst_len;
1224	u8 slen = KEYLENGTH - plen;
1225	dscp_t dscp;
1226	u32 key;
1227	int err;
1228
1229	key = ntohl(cfg->fc_dst);
1230
1231	if (!fib_valid_key_len(key, plen, extack))
1232		return -EINVAL;
1233
1234	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1235
1236	fi = fib_create_info(cfg, extack);
1237	if (IS_ERR(fi)) {
1238		err = PTR_ERR(fi);
1239		goto err;
1240	}
1241
1242	dscp = cfg->fc_dscp;
1243	l = fib_find_node(t, &tp, key);
1244	fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
1245				tb->tb_id, false) : NULL;
1246
1247	/* Now fa, if non-NULL, points to the first fib alias
1248	 * with the same keys [prefix,dscp,priority], if such key already
1249	 * exists or to the node before which we will insert new one.
1250	 *
1251	 * If fa is NULL, we will need to allocate a new one and
1252	 * insert to the tail of the section matching the suffix length
1253	 * of the new alias.
1254	 */
1255
1256	if (fa && fa->fa_dscp == dscp &&
1257	    fa->fa_info->fib_priority == fi->fib_priority) {
1258		struct fib_alias *fa_first, *fa_match;
1259
1260		err = -EEXIST;
1261		if (cfg->fc_nlflags & NLM_F_EXCL)
1262			goto out;
1263
1264		nlflags &= ~NLM_F_EXCL;
1265
1266		/* We have 2 goals:
1267		 * 1. Find exact match for type, scope, fib_info to avoid
1268		 * duplicate routes
1269		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1270		 */
1271		fa_match = NULL;
1272		fa_first = fa;
1273		hlist_for_each_entry_from(fa, fa_list) {
1274			if ((fa->fa_slen != slen) ||
1275			    (fa->tb_id != tb->tb_id) ||
1276			    (fa->fa_dscp != dscp))
1277				break;
1278			if (fa->fa_info->fib_priority != fi->fib_priority)
1279				break;
1280			if (fa->fa_type == cfg->fc_type &&
1281			    fa->fa_info == fi) {
1282				fa_match = fa;
1283				break;
1284			}
1285		}
1286
1287		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1288			struct fib_info *fi_drop;
1289			u8 state;
1290
1291			nlflags |= NLM_F_REPLACE;
1292			fa = fa_first;
			/* An identical route already exists.  Report success
			 * only when it is the first alias of the group (the
			 * one a replace would target); otherwise err is
			 * still -EEXIST from above.
			 */
1293			if (fa_match) {
1294				if (fa == fa_match)
1295					err = 0;
1296				goto out;
1297			}
1298			err = -ENOBUFS;
1299			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1300			if (!new_fa)
1301				goto out;
1302
			/* the new alias keeps dscp, slen and state (minus
			 * FA_S_ACCESSED) of the alias it replaces
			 */
1303			fi_drop = fa->fa_info;
1304			new_fa->fa_dscp = fa->fa_dscp;
1305			new_fa->fa_info = fi;
1306			new_fa->fa_type = cfg->fc_type;
1307			state = fa->fa_state;
1308			new_fa->fa_state = state & ~FA_S_ACCESSED;
1309			new_fa->fa_slen = fa->fa_slen;
1310			new_fa->tb_id = tb->tb_id;
1311			new_fa->fa_default = -1;
1312			new_fa->offload = 0;
1313			new_fa->trap = 0;
1314			new_fa->offload_failed = 0;
1315
1316			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1317
			/* notify only if the new alias is the first one for
			 * this suffix length and table; on notifier failure
			 * put the old alias back in the list
			 */
1318			if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
1319					   tb->tb_id, true) == new_fa) {
1320				enum fib_event_type fib_event;
1321
1322				fib_event = FIB_EVENT_ENTRY_REPLACE;
1323				err = call_fib_entry_notifiers(net, fib_event,
1324							       key, plen,
1325							       new_fa, extack);
1326				if (err) {
1327					hlist_replace_rcu(&new_fa->fa_list,
1328							  &fa->fa_list);
1329					goto out_free_new_fa;
1330				}
1331			}
1332
1333			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1334				  tb->tb_id, &cfg->fc_nlinfo, nlflags);
1335
1336			alias_free_mem_rcu(fa);
1337
1338			fib_release_info(fi_drop);
1339			if (state & FA_S_ACCESSED)
1340				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1341
1342			goto succeeded;
1343		}
1344		/* Error if we find a perfect match which
1345		 * uses the same scope, type, and nexthop
1346		 * information.
1347		 */
1348		if (fa_match)
1349			goto out;
1350
1351		if (cfg->fc_nlflags & NLM_F_APPEND)
1352			nlflags |= NLM_F_APPEND;
1353		else
1354			fa = fa_first;
1355	}
1356	err = -ENOENT;
1357	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1358		goto out;
1359
1360	nlflags |= NLM_F_CREATE;
1361	err = -ENOBUFS;
1362	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1363	if (!new_fa)
1364		goto out;
1365
1366	new_fa->fa_info = fi;
1367	new_fa->fa_dscp = dscp;
1368	new_fa->fa_type = cfg->fc_type;
1369	new_fa->fa_state = 0;
1370	new_fa->fa_slen = slen;
1371	new_fa->tb_id = tb->tb_id;
1372	new_fa->fa_default = -1;
1373	new_fa->offload = 0;
1374	new_fa->trap = 0;
1375	new_fa->offload_failed = 0;
1376
1377	/* Insert new entry to the list. */
1378	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
1379	if (err)
1380		goto out_free_new_fa;
1381
1382	/* The alias was already inserted, so the node must exist. */
1383	l = l ? l : fib_find_node(t, &tp, key);
1384	if (WARN_ON_ONCE(!l)) {
1385		err = -ENOENT;
1386		goto out_free_new_fa;
1387	}
1388
	/* notify only if the new alias is the first one for this suffix
	 * length and table
	 */
1389	if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
1390	    new_fa) {
1391		enum fib_event_type fib_event;
1392
1393		fib_event = FIB_EVENT_ENTRY_REPLACE;
1394		err = call_fib_entry_notifiers(net, fib_event, key, plen,
1395					       new_fa, extack);
1396		if (err)
1397			goto out_remove_new_fa;
1398	}
1399
	/* a prefix length of zero is counted in tb_num_default */
1400	if (!plen)
1401		tb->tb_num_default++;
1402
1403	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1404	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
1405		  &cfg->fc_nlinfo, nlflags);
1406succeeded:
1407	return 0;
1408
	/* Error unwinding: each label undoes one more step and falls
	 * through to the next.
	 */
1409out_remove_new_fa:
1410	fib_remove_alias(t, tp, l, new_fa);
1411out_free_new_fa:
1412	kmem_cache_free(fn_alias_kmem, new_fa);
1413out:
1414	fib_release_info(fi);
1415err:
1416	return err;
1417}
1418
1419static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
1420{
1421	t_key prefix = n->key;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1422
1423	return (key ^ prefix) & (prefix | -prefix);
1424}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1425
1426bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
1427			 const struct flowi4 *flp)
1428{
1429	if (nhc->nhc_flags & RTNH_F_DEAD)
1430		return false;
1431
1432	if (ip_ignore_linkdown(nhc->nhc_dev) &&
1433	    nhc->nhc_flags & RTNH_F_LINKDOWN &&
1434	    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
1435		return false;
 
 
 
 
 
 
 
 
 
 
 
1436
1437	if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif)
1438		return false;
 
 
1439
1440	return true;
1441}
1442
1443/* should be called with rcu_read_lock
 *
 * Longest-prefix-match lookup of flp->daddr in table @tb.
 *
 * The function works in three steps (marked below): descend the trie by
 * exact index, switch to prefix matching / backtracking when the descent
 * runs out, and finally check the aliases of the leaf that was reached.
 * When a leaf yields no usable alias the search resumes at 'backtrace'.
 *
 * On a match @res is filled in and, unless FIB_LOOKUP_NOREF is set in
 * @fib_flags, a reference is taken on fi->fib_clntref.
 *
 * Returns:
 *   -EAGAIN  if the trie is empty or backtracking reached the root;
 *   the fib_props[] error of the matching alias' type, either directly
 *   when it is negative (reject) or after @res has been filled in;
 *   the fib_props[RTN_BLACKHOLE] error for a blackhole nexthop object.
 */
1444int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1445		     struct fib_result *res, int fib_flags)
1446{
1447	struct trie *t = (struct trie *) tb->tb_data;
1448#ifdef CONFIG_IP_FIB_TRIE_STATS
1449	struct trie_use_stats __percpu *stats = t->stats;
1450#endif
1451	const t_key key = ntohl(flp->daddr);
1452	struct key_vector *n, *pn;
1453	struct fib_alias *fa;
1454	unsigned long index;
1455	t_key cindex;
1456
	/* pn/cindex remember where backtracking has to resume */
1457	pn = t->kv;
1458	cindex = 0;
1459
1460	n = get_child_rcu(pn, cindex);
1461	if (!n) {
1462		trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
1463		return -EAGAIN;
1464	}
1465
1466#ifdef CONFIG_IP_FIB_TRIE_STATS
1467	this_cpu_inc(stats->gets);
1468#endif
1469
1470	/* Step 1: Travel to the longest prefix match in the trie */
1471	for (;;) {
1472		index = get_cindex(key, n);
1473
1474		/* This bit of code is a bit tricky but it combines multiple
1475		 * checks into a single check.  The prefix consists of the
1476		 * prefix plus zeros for the "bits" in the prefix. The index
1477		 * is the difference between the key and this value.  From
1478		 * this we can actually derive several pieces of data.
1479		 *   if (index >= (1ul << bits))
1480		 *     we have a mismatch in skip bits and failed
1481		 *   else
1482		 *     we know the value is cindex
1483		 *
1484		 * This check is safe even if bits == KEYLENGTH due to the
1485		 * fact that we can only allocate a node with 32 bits if a
1486		 * long is greater than 32 bits.
1487		 */
1488		if (index >= (1ul << n->bits))
1489			break;
1490
1491		/* we have found a leaf. Prefixes have already been compared */
1492		if (IS_LEAF(n))
1493			goto found;
1494
1495		/* only record pn and cindex if we are going to be chopping
1496		 * bits later.  Otherwise we are just wasting cycles.
1497		 */
1498		if (n->slen > n->pos) {
1499			pn = n;
1500			cindex = index;
1501		}
1502
1503		n = get_child_rcu(n, index);
1504		if (unlikely(!n))
1505			goto backtrace;
1506	}
1507
1508	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
1509	for (;;) {
1510		/* record the pointer where our next node pointer is stored */
1511		struct key_vector __rcu **cptr = n->tnode;
1512
1513		/* This test verifies that none of the bits that differ
1514		 * between the key and the prefix exist in the region of
1515		 * the lsb and higher in the prefix.
1516		 */
1517		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
1518			goto backtrace;
1519
1520		/* exit out and process leaf */
1521		if (unlikely(IS_LEAF(n)))
1522			break;
1523
1524		/* Don't bother recording parent info.  Since we are in
1525		 * prefix match mode we will have to come back to wherever
1526		 * we started this traversal anyway
1527		 */
1528
		/* NOTE: 'backtrace' is also entered by goto from Step 1,
		 * from the mismatch test above and from the 'miss' path
		 * at the end of the function.
		 */
1529		while ((n = rcu_dereference(*cptr)) == NULL) {
1530backtrace:
1531#ifdef CONFIG_IP_FIB_TRIE_STATS
1532			if (!n)
1533				this_cpu_inc(stats->null_node_hit);
1534#endif
1535			/* If we are at cindex 0 there are no more bits for
1536			 * us to strip at this level so we must ascend back
1537			 * up one level to see if there are any more bits to
1538			 * be stripped there.
1539			 */
1540			while (!cindex) {
1541				t_key pkey = pn->key;
1542
1543				/* If we don't have a parent then there is
1544				 * nothing for us to do as we do not have any
1545				 * further nodes to parse.
1546				 */
1547				if (IS_TRIE(pn)) {
1548					trace_fib_table_lookup(tb->tb_id, flp,
1549							       NULL, -EAGAIN);
1550					return -EAGAIN;
1551				}
1552#ifdef CONFIG_IP_FIB_TRIE_STATS
1553				this_cpu_inc(stats->backtrack);
1554#endif
1555				/* Get Child's index */
1556				pn = node_parent_rcu(pn);
1557				cindex = get_index(pkey, pn);
1558			}
1559
1560			/* strip the least significant bit from the cindex */
1561			cindex &= cindex - 1;
1562
1563			/* grab pointer for next child node */
1564			cptr = &pn->tnode[cindex];
1565		}
1566	}
1567
1568found:
1569	/* this line carries forward the xor from earlier in the function */
1570	index = key ^ n->key;
1571
1572	/* Step 3: Process the leaf, if that fails fall back to backtracing */
1573	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
1574		struct fib_info *fi = fa->fa_info;
1575		struct fib_nh_common *nhc;
1576		int nhsel, err;
1577
		/* skip aliases whose suffix is too short to cover the bits
		 * in which key and leaf differ; the outer test avoids a
		 * shift by the full width of a long
		 */
1578		if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
1579			if (index >= (1ul << fa->fa_slen))
1580				continue;
1581		}
		/* a non-zero alias DSCP has to equal the flow's TOS */
1582		if (fa->fa_dscp &&
1583		    inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos)
1584			continue;
1585		if (fi->fib_dead)
1586			continue;
1587		if (fa->fa_info->fib_scope < flp->flowi4_scope)
1588			continue;
1589		fib_alias_accessed(fa);
1590		err = fib_props[fa->fa_type].error;
1591		if (unlikely(err < 0)) {
			/* also reached by goto from the blackhole check */
1592out_reject:
1593#ifdef CONFIG_IP_FIB_TRIE_STATS
1594			this_cpu_inc(stats->semantic_match_passed);
1595#endif
1596			trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
1597			return err;
1598		}
1599		if (fi->fib_flags & RTNH_F_DEAD)
1600			continue;
1601
		/* route uses a nexthop object (fi->nh) */
1602		if (unlikely(fi->nh)) {
1603			if (nexthop_is_blackhole(fi->nh)) {
1604				err = fib_props[RTN_BLACKHOLE].error;
1605				goto out_reject;
1606			}
1607
1608			nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
1609						     &nhsel);
1610			if (nhc)
1611				goto set_result;
1612			goto miss;
1613		}
1614
		/* take the first path that fib_lookup_good_nhc() accepts */
1615		for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
1616			nhc = fib_info_nhc(fi, nhsel);
1617
1618			if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
1619				continue;
			/* also reached by goto from the fi->nh branch above */
1620set_result:
1621			if (!(fib_flags & FIB_LOOKUP_NOREF))
1622				refcount_inc(&fi->fib_clntref);
1623
1624			res->prefix = htonl(n->key);
1625			res->prefixlen = KEYLENGTH - fa->fa_slen;
1626			res->nh_sel = nhsel;
1627			res->nhc = nhc;
1628			res->type = fa->fa_type;
1629			res->scope = fi->fib_scope;
1630			res->fi = fi;
1631			res->table = tb;
1632			res->fa_head = &n->leaf;
1633#ifdef CONFIG_IP_FIB_TRIE_STATS
1634			this_cpu_inc(stats->semantic_match_passed);
1635#endif
1636			trace_fib_table_lookup(tb->tb_id, flp, nhc, err);
1637
1638			return err;
1639		}
1640	}
	/* no alias of this leaf was usable: resume the prefix search */
1641miss:
1642#ifdef CONFIG_IP_FIB_TRIE_STATS
1643	this_cpu_inc(stats->semantic_match_miss);
1644#endif
1645	goto backtrace;
1646}
1647EXPORT_SYMBOL_GPL(fib_table_lookup);
1648
1649static void fib_remove_alias(struct trie *t, struct key_vector *tp,
1650			     struct key_vector *l, struct fib_alias *old)
 
 
1651{
1652	/* record the location of the previous list_info entry */
1653	struct hlist_node **pprev = old->fa_list.pprev;
1654	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
1655
1656	/* remove the fib_alias from the list */
1657	hlist_del_rcu(&old->fa_list);
1658
1659	/* if we emptied the list this leaf will be freed and we can sort
1660	 * out parent suffix lengths as a part of trie_rebalance
1661	 */
1662	if (hlist_empty(&l->leaf)) {
1663		if (tp->slen == l->slen)
1664			node_pull_suffix(tp, tp->pos);
1665		put_child_root(tp, l->key, NULL);
1666		node_free(l);
1667		trie_rebalance(t, tp);
1668		return;
1669	}
1670
1671	/* only access fa if it is pointing at the last valid hlist_node */
1672	if (*pprev)
1673		return;
1674
1675	/* update the trie with the latest suffix length */
1676	l->slen = fa->fa_slen;
1677	node_pull_suffix(tp, fa->fa_slen);
1678}
1679
1680static void fib_notify_alias_delete(struct net *net, u32 key,
1681				    struct hlist_head *fah,
1682				    struct fib_alias *fa_to_delete,
1683				    struct netlink_ext_ack *extack)
1684{
1685	struct fib_alias *fa_next, *fa_to_notify;
1686	u32 tb_id = fa_to_delete->tb_id;
1687	u8 slen = fa_to_delete->fa_slen;
1688	enum fib_event_type fib_event;
1689
1690	/* Do not notify if we do not care about the route. */
1691	if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
1692		return;
1693
1694	/* Determine if the route should be replaced by the next route in the
1695	 * list.
1696	 */
1697	fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
1698				   struct fib_alias, fa_list);
1699	if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
1700		fib_event = FIB_EVENT_ENTRY_REPLACE;
1701		fa_to_notify = fa_next;
1702	} else {
1703		fib_event = FIB_EVENT_ENTRY_DEL;
1704		fa_to_notify = fa_to_delete;
1705	}
1706	call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
1707				 fa_to_notify, extack);
1708}
1709
1710/* Caller must hold RTNL. */
1711int fib_table_delete(struct net *net, struct fib_table *tb,
1712		     struct fib_config *cfg, struct netlink_ext_ack *extack)
1713{
1714	struct trie *t = (struct trie *) tb->tb_data;
 
 
 
1715	struct fib_alias *fa, *fa_to_delete;
1716	struct key_vector *l, *tp;
1717	u8 plen = cfg->fc_dst_len;
1718	u8 slen = KEYLENGTH - plen;
1719	dscp_t dscp;
1720	u32 key;
 
1721
1722	key = ntohl(cfg->fc_dst);
 
1723
1724	if (!fib_valid_key_len(key, plen, extack))
1725		return -EINVAL;
1726
1727	l = fib_find_node(t, &tp, key);
 
 
1728	if (!l)
1729		return -ESRCH;
1730
1731	dscp = cfg->fc_dscp;
1732	fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
 
1733	if (!fa)
1734		return -ESRCH;
1735
1736	pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
1737		 inet_dscp_to_dsfield(dscp), t);
1738
1739	fa_to_delete = NULL;
1740	hlist_for_each_entry_from(fa, fa_list) {
 
1741		struct fib_info *fi = fa->fa_info;
1742
1743		if ((fa->fa_slen != slen) ||
1744		    (fa->tb_id != tb->tb_id) ||
1745		    (fa->fa_dscp != dscp))
1746			break;
1747
1748		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1749		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1750		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1751		    (!cfg->fc_prefsrc ||
1752		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1753		    (!cfg->fc_protocol ||
1754		     fi->fib_protocol == cfg->fc_protocol) &&
1755		    fib_nh_match(net, cfg, fi, extack) == 0 &&
1756		    fib_metrics_match(cfg, fi)) {
1757			fa_to_delete = fa;
1758			break;
1759		}
1760	}
1761
1762	if (!fa_to_delete)
1763		return -ESRCH;
1764
1765	fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
1766	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
1767		  &cfg->fc_nlinfo, 0);
1768
 
 
 
 
 
1769	if (!plen)
1770		tb->tb_num_default--;
1771
1772	fib_remove_alias(t, tp, l, fa_to_delete);
 
 
 
 
 
 
1773
1774	if (fa_to_delete->fa_state & FA_S_ACCESSED)
1775		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1776
1777	fib_release_info(fa_to_delete->fa_info);
1778	alias_free_mem_rcu(fa_to_delete);
1779	return 0;
1780}
1781
1782/* Scan for the next leaf starting at the provided key value */
1783static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
1784{
1785	struct key_vector *pn, *n = *tn;
1786	unsigned long cindex;
1787
1788	/* this loop is meant to try and find the key in the trie */
1789	do {
1790		/* record parent and next child index */
1791		pn = n;
1792		cindex = (key > pn->key) ? get_index(key, pn) : 0;
1793
1794		if (cindex >> pn->bits)
1795			break;
1796
1797		/* descend into the next child */
1798		n = get_child_rcu(pn, cindex++);
1799		if (!n)
1800			break;
1801
1802		/* guarantee forward progress on the keys */
1803		if (IS_LEAF(n) && (n->key >= key))
1804			goto found;
1805	} while (IS_TNODE(n));
1806
1807	/* this loop will search for the next leaf with a greater key */
1808	while (!IS_TRIE(pn)) {
1809		/* if we exhausted the parent node we will need to climb */
1810		if (cindex >= (1ul << pn->bits)) {
1811			t_key pkey = pn->key;
1812
1813			pn = node_parent_rcu(pn);
1814			cindex = get_index(pkey, pn) + 1;
1815			continue;
1816		}
1817
1818		/* grab the next available node */
1819		n = get_child_rcu(pn, cindex++);
1820		if (!n)
1821			continue;
1822
1823		/* no need to compare keys since we bumped the index */
1824		if (IS_LEAF(n))
1825			goto found;
1826
1827		/* Rescan start scanning in new node */
1828		pn = n;
1829		cindex = 0;
1830	}
1831
1832	*tn = pn;
1833	return NULL; /* Root of trie */
1834found:
1835	/* if we are at the limit for keys just return NULL for the tnode */
1836	*tn = pn;
1837	return n;
1838}
1839
1840static void fib_trie_free(struct fib_table *tb)
1841{
1842	struct trie *t = (struct trie *)tb->tb_data;
1843	struct key_vector *pn = t->kv;
1844	unsigned long cindex = 1;
1845	struct hlist_node *tmp;
1846	struct fib_alias *fa;
1847
1848	/* walk trie in reverse order and free everything */
1849	for (;;) {
1850		struct key_vector *n;
1851
1852		if (!(cindex--)) {
1853			t_key pkey = pn->key;
1854
1855			if (IS_TRIE(pn))
1856				break;
1857
1858			n = pn;
1859			pn = node_parent(pn);
1860
1861			/* drop emptied tnode */
1862			put_child_root(pn, n->key, NULL);
1863			node_free(n);
1864
1865			cindex = get_index(pkey, pn);
1866
1867			continue;
1868		}
1869
1870		/* grab the next available node */
1871		n = get_child(pn, cindex);
1872		if (!n)
1873			continue;
1874
1875		if (IS_TNODE(n)) {
1876			/* record pn and cindex for leaf walking */
1877			pn = n;
1878			cindex = 1ul << n->bits;
1879
1880			continue;
1881		}
1882
1883		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1884			hlist_del_rcu(&fa->fa_list);
1885			alias_free_mem_rcu(fa);
1886		}
1887
1888		put_child_root(pn, n->key, NULL);
1889		node_free(n);
1890	}
1891
1892#ifdef CONFIG_IP_FIB_TRIE_STATS
1893	free_percpu(t->stats);
1894#endif
1895	kfree(tb);
1896}
1897
1898struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
 
 
 
 
1899{
1900	struct trie *ot = (struct trie *)oldtb->tb_data;
1901	struct key_vector *l, *tp = ot->kv;
1902	struct fib_table *local_tb;
1903	struct fib_alias *fa;
1904	struct trie *lt;
1905	t_key key = 0;
1906
1907	if (oldtb->tb_data == oldtb->__data)
1908		return oldtb;
1909
1910	local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
1911	if (!local_tb)
1912		return NULL;
1913
1914	lt = (struct trie *)local_tb->tb_data;
1915
1916	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
1917		struct key_vector *local_l = NULL, *local_tp;
1918
1919		hlist_for_each_entry(fa, &l->leaf, fa_list) {
1920			struct fib_alias *new_fa;
1921
1922			if (local_tb->tb_id != fa->tb_id)
 
 
1923				continue;
1924
1925			/* clone fa for new local table */
1926			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1927			if (!new_fa)
1928				goto out;
1929
1930			memcpy(new_fa, fa, sizeof(*fa));
1931
1932			/* insert clone into table */
1933			if (!local_l)
1934				local_l = fib_find_node(lt, &local_tp, l->key);
1935
1936			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
1937					     NULL, l->key)) {
1938				kmem_cache_free(fn_alias_kmem, new_fa);
1939				goto out;
1940			}
1941		}
1942
1943		/* stop loop if key wrapped back to 0 */
1944		key = l->key + 1;
1945		if (key < l->key)
1946			break;
1947	}
1948
1949	return local_tb;
1950out:
1951	fib_trie_free(local_tb);
1952
1953	return NULL;
1954}
1955
1956/* Caller must hold RTNL */
1957void fib_table_flush_external(struct fib_table *tb)
1958{
1959	struct trie *t = (struct trie *)tb->tb_data;
1960	struct key_vector *pn = t->kv;
1961	unsigned long cindex = 1;
1962	struct hlist_node *tmp;
1963	struct fib_alias *fa;
1964
1965	/* walk trie in reverse order */
1966	for (;;) {
1967		unsigned char slen = 0;
1968		struct key_vector *n;
1969
1970		if (!(cindex--)) {
1971			t_key pkey = pn->key;
1972
1973			/* cannot resize the trie vector */
1974			if (IS_TRIE(pn))
1975				break;
1976
1977			/* update the suffix to address pulled leaves */
1978			if (pn->slen > pn->pos)
1979				update_suffix(pn);
1980
1981			/* resize completed node */
1982			pn = resize(t, pn);
1983			cindex = get_index(pkey, pn);
1984
1985			continue;
1986		}
1987
1988		/* grab the next available node */
1989		n = get_child(pn, cindex);
1990		if (!n)
1991			continue;
1992
1993		if (IS_TNODE(n)) {
1994			/* record pn and cindex for leaf walking */
1995			pn = n;
1996			cindex = 1ul << n->bits;
1997
1998			continue;
1999		}
2000
2001		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
2002			/* if alias was cloned to local then we just
2003			 * need to remove the local copy from main
2004			 */
2005			if (tb->tb_id != fa->tb_id) {
2006				hlist_del_rcu(&fa->fa_list);
2007				alias_free_mem_rcu(fa);
2008				continue;
2009			}
2010
2011			/* record local slen */
2012			slen = fa->fa_slen;
2013		}
2014
2015		/* update leaf slen */
2016		n->slen = slen;
2017
2018		if (hlist_empty(&n->leaf)) {
2019			put_child_root(pn, n->key, NULL);
2020			node_free(n);
2021		}
2022	}
2023}
2024
2025/* Caller must hold RTNL. */
2026int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
2027{
2028	struct trie *t = (struct trie *)tb->tb_data;
2029	struct key_vector *pn = t->kv;
2030	unsigned long cindex = 1;
2031	struct hlist_node *tmp;
2032	struct fib_alias *fa;
2033	int found = 0;
2034
2035	/* walk trie in reverse order */
2036	for (;;) {
2037		unsigned char slen = 0;
2038		struct key_vector *n;
2039
2040		if (!(cindex--)) {
2041			t_key pkey = pn->key;
2042
2043			/* cannot resize the trie vector */
2044			if (IS_TRIE(pn))
2045				break;
2046
2047			/* update the suffix to address pulled leaves */
2048			if (pn->slen > pn->pos)
2049				update_suffix(pn);
2050
2051			/* resize completed node */
2052			pn = resize(t, pn);
2053			cindex = get_index(pkey, pn);
2054
2055			continue;
2056		}
2057
2058		/* grab the next available node */
2059		n = get_child(pn, cindex);
2060		if (!n)
2061			continue;
2062
2063		if (IS_TNODE(n)) {
2064			/* record pn and cindex for leaf walking */
2065			pn = n;
2066			cindex = 1ul << n->bits;
2067
2068			continue;
2069		}
2070
2071		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
2072			struct fib_info *fi = fa->fa_info;
2073
2074			if (!fi || tb->tb_id != fa->tb_id ||
2075			    (!(fi->fib_flags & RTNH_F_DEAD) &&
2076			     !fib_props[fa->fa_type].error)) {
2077				slen = fa->fa_slen;
2078				continue;
2079			}
2080
2081			/* Do not flush error routes if network namespace is
2082			 * not being dismantled
2083			 */
2084			if (!flush_all && fib_props[fa->fa_type].error) {
2085				slen = fa->fa_slen;
2086				continue;
2087			}
2088
2089			fib_notify_alias_delete(net, n->key, &n->leaf, fa,
2090						NULL);
2091			hlist_del_rcu(&fa->fa_list);
2092			fib_release_info(fa->fa_info);
2093			alias_free_mem_rcu(fa);
2094			found++;
2095		}
2096
2097		/* update leaf slen */
2098		n->slen = slen;
2099
2100		if (hlist_empty(&n->leaf)) {
2101			put_child_root(pn, n->key, NULL);
2102			node_free(n);
2103		}
2104	}
2105
2106	pr_debug("trie_flush found=%d\n", found);
2107	return found;
2108}
2109
2110/* derived from fib_trie_free */
2111static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
2112				     struct nl_info *info)
2113{
2114	struct trie *t = (struct trie *)tb->tb_data;
2115	struct key_vector *pn = t->kv;
2116	unsigned long cindex = 1;
2117	struct fib_alias *fa;
2118
2119	for (;;) {
2120		struct key_vector *n;
2121
2122		if (!(cindex--)) {
2123			t_key pkey = pn->key;
2124
2125			if (IS_TRIE(pn))
2126				break;
2127
2128			pn = node_parent(pn);
2129			cindex = get_index(pkey, pn);
2130			continue;
2131		}
2132
2133		/* grab the next available node */
2134		n = get_child(pn, cindex);
2135		if (!n)
2136			continue;
2137
2138		if (IS_TNODE(n)) {
2139			/* record pn and cindex for leaf walking */
2140			pn = n;
2141			cindex = 1ul << n->bits;
2142
2143			continue;
2144		}
2145
2146		hlist_for_each_entry(fa, &n->leaf, fa_list) {
2147			struct fib_info *fi = fa->fa_info;
2148
2149			if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
2150				continue;
2151
2152			rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
2153				  KEYLENGTH - fa->fa_slen, tb->tb_id,
2154				  info, NLM_F_REPLACE);
2155		}
2156	}
2157}
2158
2159void fib_info_notify_update(struct net *net, struct nl_info *info)
2160{
2161	unsigned int h;
2162
2163	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2164		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2165		struct fib_table *tb;
2166
2167		hlist_for_each_entry_rcu(tb, head, tb_hlist,
2168					 lockdep_rtnl_is_held())
2169			__fib_info_notify_update(net, tb, info);
2170	}
2171}
2172
2173static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
2174			   struct notifier_block *nb,
2175			   struct netlink_ext_ack *extack)
2176{
 
2177	struct fib_alias *fa;
2178	int last_slen = -1;
2179	int err;
2180
2181	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2182		struct fib_info *fi = fa->fa_info;
2183
2184		if (!fi)
2185			continue;
2186
2187		/* local and main table can share the same trie,
2188		 * so don't notify twice for the same entry.
2189		 */
2190		if (tb->tb_id != fa->tb_id)
2191			continue;
 
2192
2193		if (fa->fa_slen == last_slen)
2194			continue;
2195
2196		last_slen = fa->fa_slen;
2197		err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
2198					      l->key, KEYLENGTH - fa->fa_slen,
2199					      fa, extack);
2200		if (err)
2201			return err;
2202	}
2203	return 0;
2204}
2205
2206static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
2207			    struct netlink_ext_ack *extack)
2208{
2209	struct trie *t = (struct trie *)tb->tb_data;
2210	struct key_vector *l, *tp = t->kv;
2211	t_key key = 0;
2212	int err;
2213
2214	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
2215		err = fib_leaf_notify(l, tb, nb, extack);
2216		if (err)
2217			return err;
2218
2219		key = l->key + 1;
2220		/* stop in case of wrap around */
2221		if (key < l->key)
2222			break;
2223	}
2224	return 0;
2225}
2226
2227int fib_notify(struct net *net, struct notifier_block *nb,
2228	       struct netlink_ext_ack *extack)
2229{
2230	unsigned int h;
2231	int err;
2232
2233	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2234		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2235		struct fib_table *tb;
2236
2237		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2238			err = fib_table_notify(tb, nb, extack);
2239			if (err)
2240				return err;
2241		}
 
2242	}
2243	return 0;
 
2244}
2245
2246static void __trie_free_rcu(struct rcu_head *head)
 
2247{
2248	struct fib_table *tb = container_of(head, struct fib_table, rcu);
2249#ifdef CONFIG_IP_FIB_TRIE_STATS
2250	struct trie *t = (struct trie *)tb->tb_data;
2251
2252	if (tb->tb_data == tb->__data)
2253		free_percpu(t->stats);
2254#endif /* CONFIG_IP_FIB_TRIE_STATS */
2255	kfree(tb);
2256}
2257
2258void fib_free_table(struct fib_table *tb)
2259{
2260	call_rcu(&tb->rcu, __trie_free_rcu);
2261}
2262
2263static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
2264			     struct sk_buff *skb, struct netlink_callback *cb,
2265			     struct fib_dump_filter *filter)
2266{
2267	unsigned int flags = NLM_F_MULTI;
2268	__be32 xkey = htonl(l->key);
2269	int i, s_i, i_fa, s_fa, err;
2270	struct fib_alias *fa;
2271
2272	if (filter->filter_set ||
2273	    !filter->dump_exceptions || !filter->dump_routes)
2274		flags |= NLM_F_DUMP_FILTERED;
2275
2276	s_i = cb->args[4];
2277	s_fa = cb->args[5];
2278	i = 0;
2279
2280	/* rcu_read_lock is hold by caller */
2281	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2282		struct fib_info *fi = fa->fa_info;
 
 
 
2283
2284		if (i < s_i)
2285			goto next;
2286
2287		i_fa = 0;
2288
2289		if (tb->tb_id != fa->tb_id)
2290			goto next;
2291
2292		if (filter->filter_set) {
2293			if (filter->rt_type && fa->fa_type != filter->rt_type)
2294				goto next;
2295
2296			if ((filter->protocol &&
2297			     fi->fib_protocol != filter->protocol))
2298				goto next;
2299
2300			if (filter->dev &&
2301			    !fib_info_nh_uses_dev(fi, filter->dev))
2302				goto next;
2303		}
2304
2305		if (filter->dump_routes) {
2306			if (!s_fa) {
2307				struct fib_rt_info fri;
2308
2309				fri.fi = fi;
2310				fri.tb_id = tb->tb_id;
2311				fri.dst = xkey;
2312				fri.dst_len = KEYLENGTH - fa->fa_slen;
2313				fri.dscp = fa->fa_dscp;
2314				fri.type = fa->fa_type;
2315				fri.offload = READ_ONCE(fa->offload);
2316				fri.trap = READ_ONCE(fa->trap);
2317				fri.offload_failed = READ_ONCE(fa->offload_failed);
2318				err = fib_dump_info(skb,
2319						    NETLINK_CB(cb->skb).portid,
2320						    cb->nlh->nlmsg_seq,
2321						    RTM_NEWROUTE, &fri, flags);
2322				if (err < 0)
2323					goto stop;
2324			}
2325
2326			i_fa++;
2327		}
2328
2329		if (filter->dump_exceptions) {
2330			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
2331						 &i_fa, s_fa, flags);
2332			if (err < 0)
2333				goto stop;
2334		}
2335
2336next:
2337		i++;
2338	}
2339
2340	cb->args[4] = i;
2341	return skb->len;
2342
2343stop:
2344	cb->args[4] = i;
2345	cb->args[5] = i_fa;
2346	return err;
2347}
2348
2349/* rcu_read_lock needs to be hold by caller from readside */
2350int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
2351		   struct netlink_callback *cb, struct fib_dump_filter *filter)
2352{
2353	struct trie *t = (struct trie *)tb->tb_data;
2354	struct key_vector *l, *tp = t->kv;
 
 
 
 
2355	/* Dump starting at last key.
2356	 * Note: 0.0.0.0/0 (ie default) is first key.
2357	 */
2358	int count = cb->args[2];
2359	t_key key = cb->args[3];
2360
2361	/* First time here, count and key are both always 0. Count > 0
2362	 * and key == 0 means the dump has wrapped around and we are done.
2363	 */
2364	if (count && !key)
2365		return skb->len;
2366
2367	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
2368		int err;
2369
2370		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
2371		if (err < 0) {
2372			cb->args[3] = key;
2373			cb->args[2] = count;
2374			return err;
 
2375		}
2376
2377		++count;
2378		key = l->key + 1;
2379
2380		memset(&cb->args[4], 0,
2381		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
2382
2383		/* stop loop if key wrapped back to 0 */
2384		if (key < l->key)
2385			break;
2386	}
2387
2388	cb->args[3] = key;
2389	cb->args[2] = count;
2390
2391	return skb->len;
2392}
2393
2394void __init fib_trie_init(void)
2395{
2396	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2397					  sizeof(struct fib_alias),
2398					  0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
2399
2400	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2401					   LEAF_SIZE,
2402					   0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
 
2403}
2404
2405struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
 
2406{
2407	struct fib_table *tb;
2408	struct trie *t;
2409	size_t sz = sizeof(*tb);
2410
2411	if (!alias)
2412		sz += sizeof(struct trie);
2413
2414	tb = kzalloc(sz, GFP_KERNEL);
2415	if (!tb)
 
2416		return NULL;
2417
2418	tb->tb_id = id;
 
2419	tb->tb_num_default = 0;
2420	tb->tb_data = (alias ? alias->__data : tb->__data);
2421
2422	if (alias)
2423		return tb;
2424
2425	t = (struct trie *) tb->tb_data;
2426	t->kv[0].pos = KEYLENGTH;
2427	t->kv[0].slen = KEYLENGTH;
2428#ifdef CONFIG_IP_FIB_TRIE_STATS
2429	t->stats = alloc_percpu(struct trie_use_stats);
2430	if (!t->stats) {
2431		kfree(tb);
2432		tb = NULL;
2433	}
2434#endif
2435
2436	return tb;
2437}
2438
2439#ifdef CONFIG_PROC_FS
2440/* Depth first Trie walk iterator */
2441struct fib_trie_iter {
2442	struct seq_net_private p;
2443	struct fib_table *tb;
2444	struct key_vector *tnode;
2445	unsigned int index;
2446	unsigned int depth;
2447};
2448
2449static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
2450{
2451	unsigned long cindex = iter->index;
2452	struct key_vector *pn = iter->tnode;
2453	t_key pkey;
 
 
 
 
2454
2455	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2456		 iter->tnode, iter->index, iter->depth);
 
 
 
2457
2458	while (!IS_TRIE(pn)) {
2459		while (cindex < child_length(pn)) {
2460			struct key_vector *n = get_child_rcu(pn, cindex++);
2461
2462			if (!n)
2463				continue;
2464
2465			if (IS_LEAF(n)) {
2466				iter->tnode = pn;
2467				iter->index = cindex;
2468			} else {
2469				/* push down one level */
2470				iter->tnode = n;
2471				iter->index = 0;
2472				++iter->depth;
2473			}
2474
2475			return n;
2476		}
2477
2478		/* Current node exhausted, pop back up */
2479		pkey = pn->key;
2480		pn = node_parent_rcu(pn);
2481		cindex = get_index(pkey, pn) + 1;
 
 
 
 
2482		--iter->depth;
 
2483	}
2484
2485	/* record root node so further searches know we are done */
2486	iter->tnode = pn;
2487	iter->index = 0;
2488
2489	return NULL;
2490}
2491
2492static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
2493					     struct trie *t)
2494{
2495	struct key_vector *n, *pn;
2496
2497	if (!t)
2498		return NULL;
2499
2500	pn = t->kv;
2501	n = rcu_dereference(pn->tnode[0]);
2502	if (!n)
2503		return NULL;
2504
2505	if (IS_TNODE(n)) {
2506		iter->tnode = n;
2507		iter->index = 0;
2508		iter->depth = 1;
2509	} else {
2510		iter->tnode = pn;
2511		iter->index = 0;
2512		iter->depth = 0;
2513	}
2514
2515	return n;
2516}
2517
2518static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2519{
2520	struct key_vector *n;
2521	struct fib_trie_iter iter;
2522
2523	memset(s, 0, sizeof(*s));
2524
2525	rcu_read_lock();
2526	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2527		if (IS_LEAF(n)) {
2528			struct fib_alias *fa;
 
 
2529
2530			s->leaves++;
2531			s->totdepth += iter.depth;
2532			if (iter.depth > s->maxdepth)
2533				s->maxdepth = iter.depth;
2534
2535			hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
2536				++s->prefixes;
2537		} else {
 
 
 
2538			s->tnodes++;
2539			if (n->bits < MAX_STAT_DEPTH)
2540				s->nodesizes[n->bits]++;
2541			s->nullpointers += tn_info(n)->empty_children;
 
 
 
2542		}
2543	}
2544	rcu_read_unlock();
2545}
2546
2547/*
2548 *	This outputs /proc/net/fib_triestats
2549 */
2550static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2551{
2552	unsigned int i, max, pointers, bytes, avdepth;
2553
2554	if (stat->leaves)
2555		avdepth = stat->totdepth*100 / stat->leaves;
2556	else
2557		avdepth = 0;
2558
2559	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2560		   avdepth / 100, avdepth % 100);
2561	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2562
2563	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2564	bytes = LEAF_SIZE * stat->leaves;
2565
2566	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2567	bytes += sizeof(struct fib_alias) * stat->prefixes;
2568
2569	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2570	bytes += TNODE_SIZE(0) * stat->tnodes;
2571
2572	max = MAX_STAT_DEPTH;
2573	while (max > 0 && stat->nodesizes[max-1] == 0)
2574		max--;
2575
2576	pointers = 0;
2577	for (i = 1; i < max; i++)
2578		if (stat->nodesizes[i] != 0) {
2579			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2580			pointers += (1<<i) * stat->nodesizes[i];
2581		}
2582	seq_putc(seq, '\n');
2583	seq_printf(seq, "\tPointers: %u\n", pointers);
2584
2585	bytes += sizeof(struct key_vector *) * pointers;
2586	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2587	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2588}
2589
2590#ifdef CONFIG_IP_FIB_TRIE_STATS
2591static void trie_show_usage(struct seq_file *seq,
2592			    const struct trie_use_stats __percpu *stats)
2593{
2594	struct trie_use_stats s = { 0 };
2595	int cpu;
2596
2597	/* loop through all of the CPUs and gather up the stats */
2598	for_each_possible_cpu(cpu) {
2599		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2600
2601		s.gets += pcpu->gets;
2602		s.backtrack += pcpu->backtrack;
2603		s.semantic_match_passed += pcpu->semantic_match_passed;
2604		s.semantic_match_miss += pcpu->semantic_match_miss;
2605		s.null_node_hit += pcpu->null_node_hit;
2606		s.resize_node_skipped += pcpu->resize_node_skipped;
2607	}
2608
2609	seq_printf(seq, "\nCounters:\n---------\n");
2610	seq_printf(seq, "gets = %u\n", s.gets);
2611	seq_printf(seq, "backtracks = %u\n", s.backtrack);
2612	seq_printf(seq, "semantic match passed = %u\n",
2613		   s.semantic_match_passed);
2614	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2615	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2616	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
 
 
2617}
2618#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2619
2620static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2621{
2622	if (tb->tb_id == RT_TABLE_LOCAL)
2623		seq_puts(seq, "Local:\n");
2624	else if (tb->tb_id == RT_TABLE_MAIN)
2625		seq_puts(seq, "Main:\n");
2626	else
2627		seq_printf(seq, "Id %d:\n", tb->tb_id);
2628}
2629
2630
2631static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2632{
2633	struct net *net = seq->private;
2634	unsigned int h;
2635
2636	seq_printf(seq,
2637		   "Basic info: size of leaf:"
2638		   " %zd bytes, size of tnode: %zd bytes.\n",
2639		   LEAF_SIZE, TNODE_SIZE(0));
2640
2641	rcu_read_lock();
2642	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2643		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 
2644		struct fib_table *tb;
2645
2646		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2647			struct trie *t = (struct trie *) tb->tb_data;
2648			struct trie_stat stat;
2649
2650			if (!t)
2651				continue;
2652
2653			fib_table_print(seq, tb);
2654
2655			trie_collect_stats(t, &stat);
2656			trie_show_stats(seq, &stat);
2657#ifdef CONFIG_IP_FIB_TRIE_STATS
2658			trie_show_usage(seq, t->stats);
2659#endif
2660		}
2661		cond_resched_rcu();
2662	}
2663	rcu_read_unlock();
2664
2665	return 0;
2666}
2667
2668static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
 
 
 
 
 
 
 
 
 
 
 
 
 
2669{
2670	struct fib_trie_iter *iter = seq->private;
2671	struct net *net = seq_file_net(seq);
2672	loff_t idx = 0;
2673	unsigned int h;
2674
2675	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2676		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 
2677		struct fib_table *tb;
2678
2679		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2680			struct key_vector *n;
2681
2682			for (n = fib_trie_get_first(iter,
2683						    (struct trie *) tb->tb_data);
2684			     n; n = fib_trie_get_next(iter))
2685				if (pos == idx++) {
2686					iter->tb = tb;
2687					return n;
2688				}
2689		}
2690	}
2691
2692	return NULL;
2693}
2694
2695static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2696	__acquires(RCU)
2697{
2698	rcu_read_lock();
2699	return fib_trie_get_idx(seq, *pos);
2700}
2701
2702static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2703{
2704	struct fib_trie_iter *iter = seq->private;
2705	struct net *net = seq_file_net(seq);
2706	struct fib_table *tb = iter->tb;
2707	struct hlist_node *tb_node;
2708	unsigned int h;
2709	struct key_vector *n;
2710
2711	++*pos;
2712	/* next node in same table */
2713	n = fib_trie_get_next(iter);
2714	if (n)
2715		return n;
2716
2717	/* walk rest of this hash chain */
2718	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2719	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2720		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2721		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2722		if (n)
2723			goto found;
2724	}
2725
2726	/* new hash chain */
2727	while (++h < FIB_TABLE_HASHSZ) {
2728		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2729		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2730			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2731			if (n)
2732				goto found;
2733		}
2734	}
2735	return NULL;
2736
2737found:
2738	iter->tb = tb;
2739	return n;
2740}
2741
2742static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2743	__releases(RCU)
2744{
2745	rcu_read_unlock();
2746}
2747
/* Emit @n levels of indentation (three spaces each); nothing is written
 * when @n is zero or negative.
 */
static void seq_indent(struct seq_file *seq, int n)
{
	int level;

	for (level = 0; level < n; level++)
		seq_puts(seq, "   ");
}
2753
2754static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2755{
2756	switch (s) {
2757	case RT_SCOPE_UNIVERSE: return "universe";
2758	case RT_SCOPE_SITE:	return "site";
2759	case RT_SCOPE_LINK:	return "link";
2760	case RT_SCOPE_HOST:	return "host";
2761	case RT_SCOPE_NOWHERE:	return "nowhere";
2762	default:
2763		snprintf(buf, len, "scope=%d", s);
2764		return buf;
2765	}
2766}
2767
2768static const char *const rtn_type_names[__RTN_MAX] = {
2769	[RTN_UNSPEC] = "UNSPEC",
2770	[RTN_UNICAST] = "UNICAST",
2771	[RTN_LOCAL] = "LOCAL",
2772	[RTN_BROADCAST] = "BROADCAST",
2773	[RTN_ANYCAST] = "ANYCAST",
2774	[RTN_MULTICAST] = "MULTICAST",
2775	[RTN_BLACKHOLE] = "BLACKHOLE",
2776	[RTN_UNREACHABLE] = "UNREACHABLE",
2777	[RTN_PROHIBIT] = "PROHIBIT",
2778	[RTN_THROW] = "THROW",
2779	[RTN_NAT] = "NAT",
2780	[RTN_XRESOLVE] = "XRESOLVE",
2781};
2782
2783static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2784{
2785	if (t < __RTN_MAX && rtn_type_names[t])
2786		return rtn_type_names[t];
2787	snprintf(buf, len, "type %u", t);
2788	return buf;
2789}
2790
2791/* Pretty print the trie */
2792static int fib_trie_seq_show(struct seq_file *seq, void *v)
2793{
2794	const struct fib_trie_iter *iter = seq->private;
2795	struct key_vector *n = v;
2796
2797	if (IS_TRIE(node_parent_rcu(n)))
2798		fib_table_print(seq, iter->tb);
2799
2800	if (IS_TNODE(n)) {
2801		__be32 prf = htonl(n->key);
 
2802
2803		seq_indent(seq, iter->depth-1);
2804		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
2805			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
2806			   tn_info(n)->full_children,
2807			   tn_info(n)->empty_children);
2808	} else {
2809		__be32 val = htonl(n->key);
2810		struct fib_alias *fa;
 
 
2811
2812		seq_indent(seq, iter->depth);
2813		seq_printf(seq, "  |-- %pI4\n", &val);
2814
2815		hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
2816			char buf1[32], buf2[32];
 
 
 
2817
2818			seq_indent(seq, iter->depth + 1);
2819			seq_printf(seq, "  /%zu %s %s",
2820				   KEYLENGTH - fa->fa_slen,
2821				   rtn_scope(buf1, sizeof(buf1),
2822					     fa->fa_info->fib_scope),
2823				   rtn_type(buf2, sizeof(buf2),
2824					    fa->fa_type));
2825			if (fa->fa_dscp)
2826				seq_printf(seq, " tos=%d",
2827					   inet_dscp_to_dsfield(fa->fa_dscp));
2828			seq_putc(seq, '\n');
2829		}
2830	}
2831
2832	return 0;
2833}
2834
2835static const struct seq_operations fib_trie_seq_ops = {
2836	.start  = fib_trie_seq_start,
2837	.next   = fib_trie_seq_next,
2838	.stop   = fib_trie_seq_stop,
2839	.show   = fib_trie_seq_show,
2840};
2841
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2842struct fib_route_iter {
2843	struct seq_net_private p;
2844	struct fib_table *main_tb;
2845	struct key_vector *tnode;
2846	loff_t	pos;
2847	t_key	key;
2848};
2849
2850static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
2851					    loff_t pos)
2852{
2853	struct key_vector *l, **tp = &iter->tnode;
2854	t_key key;
2855
2856	/* use cached location of previously found key */
2857	if (iter->pos > 0 && pos >= iter->pos) {
2858		key = iter->key;
2859	} else {
2860		iter->pos = 1;
2861		key = 0;
2862	}
2863
2864	pos -= iter->pos;
2865
2866	while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
2867		key = l->key + 1;
2868		iter->pos++;
2869		l = NULL;
2870
2871		/* handle unlikely case of a key wrap */
2872		if (!key)
2873			break;
2874	}
2875
2876	if (l)
2877		iter->key = l->key;	/* remember it */
2878	else
2879		iter->pos = 0;		/* forget it */
2880
2881	return l;
2882}
2883
2884static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2885	__acquires(RCU)
2886{
2887	struct fib_route_iter *iter = seq->private;
2888	struct fib_table *tb;
2889	struct trie *t;
2890
2891	rcu_read_lock();
2892
2893	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2894	if (!tb)
2895		return NULL;
2896
2897	iter->main_tb = tb;
2898	t = (struct trie *)tb->tb_data;
2899	iter->tnode = t->kv;
2900
2901	if (*pos != 0)
2902		return fib_route_get_idx(iter, *pos);
2903
2904	iter->pos = 0;
2905	iter->key = KEY_MAX;
2906
2907	return SEQ_START_TOKEN;
2908}
2909
2910static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2911{
2912	struct fib_route_iter *iter = seq->private;
2913	struct key_vector *l = NULL;
2914	t_key key = iter->key + 1;
2915
2916	++*pos;
 
 
 
 
 
 
 
2917
2918	/* only allow key of 0 for start of sequence */
2919	if ((v == SEQ_START_TOKEN) || key)
2920		l = leaf_walk_rcu(&iter->tnode, key);
2921
2922	if (l) {
2923		iter->key = l->key;
2924		iter->pos++;
2925	} else {
2926		iter->pos = 0;
2927	}
2928
2929	return l;
2930}
2931
2932static void fib_route_seq_stop(struct seq_file *seq, void *v)
2933	__releases(RCU)
2934{
2935	rcu_read_unlock();
2936}
2937
2938static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
2939{
2940	unsigned int flags = 0;
2941
2942	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2943		flags = RTF_REJECT;
2944	if (fi) {
2945		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
2946
2947		if (nhc->nhc_gw.ipv4)
2948			flags |= RTF_GATEWAY;
2949	}
2950	if (mask == htonl(0xFFFFFFFF))
2951		flags |= RTF_HOST;
2952	flags |= RTF_UP;
2953	return flags;
2954}
2955
2956/*
2957 *	This outputs /proc/net/route.
2958 *	The format of the file is not supposed to be changed
2959 *	and needs to be same as fib_hash output to avoid breaking
2960 *	legacy utilities
2961 */
2962static int fib_route_seq_show(struct seq_file *seq, void *v)
2963{
2964	struct fib_route_iter *iter = seq->private;
2965	struct fib_table *tb = iter->main_tb;
2966	struct fib_alias *fa;
2967	struct key_vector *l = v;
2968	__be32 prefix;
2969
2970	if (v == SEQ_START_TOKEN) {
2971		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2972			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2973			   "\tWindow\tIRTT");
2974		return 0;
2975	}
2976
2977	prefix = htonl(l->key);
 
 
2978
2979	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2980		struct fib_info *fi = fa->fa_info;
2981		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
2982		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2983
2984		if ((fa->fa_type == RTN_BROADCAST) ||
2985		    (fa->fa_type == RTN_MULTICAST))
2986			continue;
 
2987
2988		if (fa->tb_id != tb->tb_id)
2989			continue;
 
2990
2991		seq_setwidth(seq, 127);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2992
2993		if (fi) {
2994			struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
2995			__be32 gw = 0;
2996
2997			if (nhc->nhc_gw_family == AF_INET)
2998				gw = nhc->nhc_gw.ipv4;
2999
3000			seq_printf(seq,
3001				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
3002				   "%d\t%08X\t%d\t%u\t%u",
3003				   nhc->nhc_dev ? nhc->nhc_dev->name : "*",
3004				   prefix, gw, flags, 0, 0,
3005				   fi->fib_priority,
3006				   mask,
3007				   (fi->fib_advmss ?
3008				    fi->fib_advmss + 40 : 0),
3009				   fi->fib_window,
3010				   fi->fib_rtt >> 3);
3011		} else {
3012			seq_printf(seq,
3013				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
3014				   "%d\t%08X\t%d\t%u\t%u",
3015				   prefix, 0, flags, 0, 0, 0,
3016				   mask, 0, 0, 0);
3017		}
3018		seq_pad(seq, '\n');
3019	}
3020
3021	return 0;
3022}
3023
3024static const struct seq_operations fib_route_seq_ops = {
3025	.start  = fib_route_seq_start,
3026	.next   = fib_route_seq_next,
3027	.stop   = fib_route_seq_stop,
3028	.show   = fib_route_seq_show,
3029};
3030
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3031int __net_init fib_proc_init(struct net *net)
3032{
3033	if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
3034			sizeof(struct fib_trie_iter)))
3035		goto out1;
3036
3037	if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
3038			fib_triestat_seq_show, NULL))
3039		goto out2;
3040
3041	if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops,
3042			sizeof(struct fib_route_iter)))
3043		goto out3;
3044
3045	return 0;
3046
3047out3:
3048	remove_proc_entry("fib_triestat", net->proc_net);
3049out2:
3050	remove_proc_entry("fib_trie", net->proc_net);
3051out1:
3052	return -ENOMEM;
3053}
3054
3055void __net_exit fib_proc_exit(struct net *net)
3056{
3057	remove_proc_entry("fib_trie", net->proc_net);
3058	remove_proc_entry("fib_triestat", net->proc_net);
3059	remove_proc_entry("route", net->proc_net);
3060}
3061
3062#endif /* CONFIG_PROC_FS */