/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 * & Swedish University of Agricultural Sciences.
 *
 * Jens Laas <jens.laas@data.slu.se> Swedish University of
 * Agricultural Sciences.
 *
 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
 *
 * This work is based on the LPC-trie which is originally described in:
 *
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system. INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Substantial contributions to this work come from:
 *
 * David S. Miller, <davem@davemloft.net>
 * Stephen Hemminger <shemminger@osdl.org>
 * Paul E. McKenney <paulmck@us.ibm.com>
 * Patrick McHardy <kaber@trash.net>
 */

#define VERSION "0.409"

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

#define MAX_STAT_DEPTH 32

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
97
98struct rt_trie_node {
99 unsigned long parent;
100 t_key key;
101};
102
103struct leaf {
104 unsigned long parent;
105 t_key key;
106 struct hlist_head list;
107 struct rcu_head rcu;
108};
109
110struct leaf_info {
111 struct hlist_node hlist;
112 int plen;
113 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
114 struct list_head falh;
115 struct rcu_head rcu;
116};
117
struct tnode {
        unsigned long parent;
        t_key key;
        unsigned char pos;              /* log2(KEYLENGTH) bits needed */
        unsigned char bits;             /* log2(KEYLENGTH) bits needed */
        unsigned int full_children;     /* KEYLENGTH bits needed */
        unsigned int empty_children;    /* KEYLENGTH bits needed */
        union {
                struct rcu_head rcu;
                struct tnode *tnode_free;
        };
        struct rt_trie_node __rcu *child[0];
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
        unsigned int gets;
        unsigned int backtrack;
        unsigned int semantic_match_passed;
        unsigned int semantic_match_miss;
        unsigned int null_node_hit;
        unsigned int resize_node_skipped;
};
#endif

struct trie_stat {
        unsigned int totdepth;
        unsigned int maxdepth;
        unsigned int tnodes;
        unsigned int leaves;
        unsigned int nullpointers;
        unsigned int prefixes;
        unsigned int nodesizes[MAX_STAT_DEPTH];
};

struct trie {
        struct rt_trie_node __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
        struct trie_use_stats stats;
#endif
};

static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
                                  int wasfull);
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;

static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;

/*
 * caller must hold RTNL
 */
static inline struct tnode *node_parent(const struct rt_trie_node *node)
{
        unsigned long parent;

        parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());

        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}

/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
{
        unsigned long parent;

        parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
                                             lockdep_rtnl_is_held());

        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}

/* Same as rcu_assign_pointer(), but that macro assumes the
 * value is a pointer.
 */
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
        smp_wmb();
        node->parent = (unsigned long)ptr | NODE_TYPE(node);
}

/*
 * caller must hold RTNL
 */
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
{
        BUG_ON(i >= 1U << tn->bits);

        return rtnl_dereference(tn->child[i]);
}

/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
        BUG_ON(i >= 1U << tn->bits);

        return rcu_dereference_rtnl(tn->child[i]);
}

static inline int tnode_child_length(const struct tnode *tn)
{
        return 1 << tn->bits;
}

static inline t_key mask_pfx(t_key k, unsigned int l)
{
        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}
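
/* Illustrative example, not from the original source: with a 32-bit t_key,
 * mask_pfx(0xDEADBEEF, 8) keeps only the top 8 bits:
 * 0xDEADBEEF >> 24 << 24 == 0xDE000000. The explicit l == 0 check avoids
 * an undefined shift by KEYLENGTH (32) bits.
 */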

static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
        if (offset < KEYLENGTH)
                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
        else
                return 0;
}
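
/* Illustrative example, not from the original source: for the key
 * 0xC0A80100 (192.168.1.0), tkey_extract_bits(key, 8, 8) first shifts the
 * key left by 8 (giving 0xA8010000) and then right by 32 - 8 = 24,
 * yielding 0xA8 (168), the second octet of the address.
 */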

static inline int tkey_equals(t_key a, t_key b)
{
        return a == b;
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
        if (bits == 0 || offset >= KEYLENGTH)
                return 1;
        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
        t_key diff = a ^ b;
        int i = offset;

        if (!diff)
                return 0;
        while ((diff << i) >> (KEYLENGTH-1) == 0)
                i++;
        return i;
}
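
/* Illustrative example, not from the original source: for a = 0xC0A80100
 * and b = 0xC0A90100, a ^ b = 0x00010000, whose only set bit is bit 15
 * counting from the MSB, so tkey_mismatch(a, 0, b) returns 15, the index
 * of the first differing bit.
 */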

/*
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
  correct key path.

  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! fib_insert_node() in this implementation takes care of that - note the
  call to tkey_sub_equals() in fib_insert_node().

  If n is an internal node - a 'tnode' here, the various parts of its key
  have many different meanings.

  Example:
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
  n->bits = 4

  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
  index into the parent's child array. That is, they will be used to find
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

  All the bits we have seen so far are significant to the node n. The rest
  of the bits are really not needed or indeed known in n->key.

  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
  n's child array, and will of course be different for each child.


  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/
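
/* Illustrative example, not from the original source: with the layout
 * pictured above (tp->pos = 7, tp->bits = 3), n sits in tp's child array
 * at index tkey_extract_bits(n->key, 7, 3), the "N" bits 7..9, and n's
 * own children are in turn indexed by tkey_extract_bits(key, 15, 4),
 * the "C" bits 15..18.
 */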

static inline void check_tnode(const struct tnode *tn)
{
        WARN_ON(tn && tn->pos+tn->bits > 32);
}

static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;

static void __alias_free_mem(struct rcu_head *head)
{
        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
        kmem_cache_free(fn_alias_kmem, fa);
}

static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
        call_rcu(&fa->rcu, __alias_free_mem);
}

static void __leaf_free_rcu(struct rcu_head *head)
{
        struct leaf *l = container_of(head, struct leaf, rcu);
        kmem_cache_free(trie_leaf_kmem, l);
}

static inline void free_leaf(struct leaf *l)
{
        call_rcu(&l->rcu, __leaf_free_rcu);
}

static inline void free_leaf_info(struct leaf_info *leaf)
{
        kfree_rcu(leaf, rcu);
}

static struct tnode *tnode_alloc(size_t size)
{
        if (size <= PAGE_SIZE)
                return kzalloc(size, GFP_KERNEL);
        else
                return vzalloc(size);
}

static void __tnode_free_rcu(struct rcu_head *head)
{
        struct tnode *tn = container_of(head, struct tnode, rcu);
        size_t size = sizeof(struct tnode) +
                      (sizeof(struct rt_trie_node *) << tn->bits);

        if (size <= PAGE_SIZE)
                kfree(tn);
        else
                vfree(tn);
}

static inline void tnode_free(struct tnode *tn)
{
        if (IS_LEAF(tn))
                free_leaf((struct leaf *) tn);
        else
                call_rcu(&tn->rcu, __tnode_free_rcu);
}

static void tnode_free_safe(struct tnode *tn)
{
        BUG_ON(IS_LEAF(tn));
        tn->tnode_free = tnode_free_head;
        tnode_free_head = tn;
        tnode_free_size += sizeof(struct tnode) +
                           (sizeof(struct rt_trie_node *) << tn->bits);
}

static void tnode_free_flush(void)
{
        struct tnode *tn;

        while ((tn = tnode_free_head)) {
                tnode_free_head = tn->tnode_free;
                tn->tnode_free = NULL;
                tnode_free(tn);
        }

        if (tnode_free_size >= PAGE_SIZE * sync_pages) {
                tnode_free_size = 0;
                synchronize_rcu();
        }
}

static struct leaf *leaf_new(void)
{
        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
        if (l) {
                l->parent = T_LEAF;
                INIT_HLIST_HEAD(&l->list);
        }
        return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
        struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
        if (li) {
                li->plen = plen;
                li->mask_plen = ntohl(inet_make_mask(plen));
                INIT_LIST_HEAD(&li->falh);
        }
        return li;
}

static struct tnode *tnode_new(t_key key, int pos, int bits)
{
        size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
        struct tnode *tn = tnode_alloc(sz);

        if (tn) {
                tn->parent = T_TNODE;
                tn->pos = pos;
                tn->bits = bits;
                tn->key = key;
                tn->full_children = 0;
                tn->empty_children = 1<<bits;
        }

        pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
                 sizeof(struct rt_trie_node *) << bits);
        return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
        if (n == NULL || IS_LEAF(n))
                return 0;

        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

static inline void put_child(struct tnode *tn, int i,
                             struct rt_trie_node *n)
{
        tnode_put_child_reorg(tn, i, n, -1);
}

/*
 * Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 */

static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
                                  int wasfull)
{
        struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
        int isfull;

        BUG_ON(i >= 1<<tn->bits);

        /* update emptyChildren */
        if (n == NULL && chi != NULL)
                tn->empty_children++;
        else if (n != NULL && chi == NULL)
                tn->empty_children--;

        /* update fullChildren */
        if (wasfull == -1)
                wasfull = tnode_full(tn, chi);

        isfull = tnode_full(tn, n);
        if (wasfull && !isfull)
                tn->full_children--;
        else if (!wasfull && isfull)
                tn->full_children++;

        if (n)
                node_set_parent(n, tn);

        rcu_assign_pointer(tn->child[i], n);
}

#define MAX_WORK 10
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
        int i;
        struct tnode *old_tn;
        int inflate_threshold_use;
        int halve_threshold_use;
        int max_work;

        if (!tn)
                return NULL;

        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
                 tn, inflate_threshold, halve_threshold);

        /* No children */
        if (tn->empty_children == tnode_child_length(tn)) {
                tnode_free_safe(tn);
                return NULL;
        }
        /* One child */
        if (tn->empty_children == tnode_child_length(tn) - 1)
                goto one_child;
        /*
         * Double as long as the resulting node has a number of
         * nonempty nodes that are above the threshold.
         */

        /*
         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
         * the Helsinki University of Technology and Matti Tikkanen of Nokia
         * Telecommunications, page 6:
         * "A node is doubled if the ratio of non-empty children to all
         * children in the *doubled* node is at least 'high'."
         *
         * 'high' in this instance is the variable 'inflate_threshold'. It
         * is expressed as a percentage, so we multiply it with
         * tnode_child_length() and instead of multiplying by 2 (since the
         * child array will be doubled by inflate()) and multiplying
         * the left-hand side by 100 (to handle the percentage thing) we
         * multiply the left-hand side by 50.
         *
         * The left-hand side may look a bit weird: tnode_child_length(tn)
         * - tn->empty_children is of course the number of non-null children
         * in the current node. tn->full_children is the number of "full"
         * children, that is non-null tnodes with a skip value of 0.
         * All of those will be doubled in the resulting inflated tnode, so
         * we just count them one extra time here.
         *
         * A clearer way to write this would be:
         *
         * to_be_doubled = tn->full_children;
         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
         *     tn->full_children;
         *
         * new_child_length = tnode_child_length(tn) * 2;
         *
         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
         *      new_child_length;
         * if (new_fill_factor >= inflate_threshold)
         *
         * ...and so on, though it would mess up the while () loop.
         *
         * anyway,
         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
         *      inflate_threshold
         *
         * avoid a division:
         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
         *      inflate_threshold * new_child_length
         *
         * expand not_to_be_doubled and to_be_doubled, and shorten:
         * 100 * (tnode_child_length(tn) - tn->empty_children +
         *    tn->full_children) >= inflate_threshold * new_child_length
         *
         * expand new_child_length:
         * 100 * (tnode_child_length(tn) - tn->empty_children +
         *    tn->full_children) >=
         *      inflate_threshold * tnode_child_length(tn) * 2
         *
         * shorten again:
         * 50 * (tn->full_children + tnode_child_length(tn) -
         *    tn->empty_children) >= inflate_threshold *
         *    tnode_child_length(tn)
         *
         */
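
        /*
         * Illustrative worked example, not from the original source: for a
         * non-root tnode with bits = 4 (16 slots), empty_children = 6 and
         * full_children = 3, the left-hand side is 50 * (3 + 16 - 6) = 650
         * and the right-hand side is 50 * 16 = 800, so the node is not
         * inflated. With only 2 empty and 6 full children the left-hand
         * side becomes 50 * (6 + 16 - 2) = 1000 >= 800, and inflate() is
         * attempted.
         */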

        check_tnode(tn);

        /* Keep root node larger */

        if (!node_parent((struct rt_trie_node *)tn)) {
                inflate_threshold_use = inflate_threshold_root;
                halve_threshold_use = halve_threshold_root;
        } else {
                inflate_threshold_use = inflate_threshold;
                halve_threshold_use = halve_threshold;
        }

        max_work = MAX_WORK;
        while ((tn->full_children > 0 && max_work-- &&
                50 * (tn->full_children + tnode_child_length(tn)
                      - tn->empty_children)
                >= inflate_threshold_use * tnode_child_length(tn))) {

                old_tn = tn;
                tn = inflate(t, tn);

                if (IS_ERR(tn)) {
                        tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
                        t->stats.resize_node_skipped++;
#endif
                        break;
                }
        }

        check_tnode(tn);

        /* Return if at least one inflate has run */
        if (max_work != MAX_WORK)
                return (struct rt_trie_node *) tn;

        /*
         * Halve as long as the number of empty children in this
         * node is above threshold.
         */
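
        /*
         * Illustrative worked example, not from the original source: with
         * halve_threshold = 25, a non-root tnode with bits = 4 and 13 empty
         * children gives 100 * (16 - 13) = 300 < 25 * 16 = 400, so it is
         * halved; with 12 empty children, 400 is not < 400, and the node
         * is left alone.
         */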

        max_work = MAX_WORK;
        while (tn->bits > 1 && max_work-- &&
               100 * (tnode_child_length(tn) - tn->empty_children) <
               halve_threshold_use * tnode_child_length(tn)) {

                old_tn = tn;
                tn = halve(t, tn);
                if (IS_ERR(tn)) {
                        tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
                        t->stats.resize_node_skipped++;
#endif
                        break;
                }
        }


        /* Only one child remains */
        if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
                for (i = 0; i < tnode_child_length(tn); i++) {
                        struct rt_trie_node *n;

                        n = rtnl_dereference(tn->child[i]);
                        if (!n)
                                continue;

                        /* compress one level */

                        node_set_parent(n, NULL);
                        tnode_free_safe(tn);
                        return n;
                }
        }
        return (struct rt_trie_node *) tn;
}


static void tnode_clean_free(struct tnode *tn)
{
        int i;
        struct tnode *tofree;

        for (i = 0; i < tnode_child_length(tn); i++) {
                tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
                if (tofree)
                        tnode_free(tofree);
        }
        tnode_free(tn);
}

static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
        struct tnode *oldtnode = tn;
        int olen = tnode_child_length(tn);
        int i;

        pr_debug("In inflate\n");

        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

        if (!tn)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate and store tnodes before the actual work so we
         * don't get into an inconsistent state if memory allocation
         * fails. In case of failure we return the old node and the
         * inflate operation is abandoned.
         */

        for (i = 0; i < olen; i++) {
                struct tnode *inode;

                inode = (struct tnode *) tnode_get_child(oldtnode, i);
                if (inode &&
                    IS_TNODE(inode) &&
                    inode->pos == oldtnode->pos + oldtnode->bits &&
                    inode->bits > 1) {
                        struct tnode *left, *right;
                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

                        left = tnode_new(inode->key&(~m), inode->pos + 1,
                                         inode->bits - 1);
                        if (!left)
                                goto nomem;

                        right = tnode_new(inode->key|m, inode->pos + 1,
                                          inode->bits - 1);

                        if (!right) {
                                tnode_free(left);
                                goto nomem;
                        }

                        put_child(tn, 2*i, (struct rt_trie_node *) left);
                        put_child(tn, 2*i+1, (struct rt_trie_node *) right);
                }
        }

        for (i = 0; i < olen; i++) {
                struct tnode *inode;
                struct rt_trie_node *node = tnode_get_child(oldtnode, i);
                struct tnode *left, *right;
                int size, j;

                /* An empty child */
                if (node == NULL)
                        continue;

                /* A leaf or an internal node with skipped bits */

                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
                    tn->pos + tn->bits - 1) {
                        put_child(tn,
                                  tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
                                  node);
                        continue;
                }

                /* An internal node with two children */
                inode = (struct tnode *) node;

                if (inode->bits == 1) {
                        put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
                        put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));

                        tnode_free_safe(inode);
                        continue;
                }

                /* An internal node with more than two children */

                /* We will replace this node 'inode' with two new
                 * ones, 'left' and 'right', each with half of the
                 * original children. The two new nodes will have
                 * a position one bit further down the key and this
                 * means that the "significant" part of their keys
                 * (see the discussion near the top of this file)
                 * will differ by one bit, which will be "0" in
                 * left's key and "1" in right's key. Since we are
                 * moving the key position by one step, the bit that
                 * we are moving away from - the bit at position
                 * (inode->pos) - is the one that will differ between
                 * left and right. So... we synthesize that bit in the
                 * two new keys.
                 * The mask 'm' below will be a single "one" bit at
                 * the position (inode->pos)
                 */
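
                /*
                 * Illustrative example, not from the original source: for
                 * inode->pos = 26, m = ~0U << 31 >> 26 = 0x00000020, a
                 * single bit at position 26 counting from the MSB; left
                 * takes inode->key & ~m (that bit forced to 0) and right
                 * takes inode->key | m (forced to 1), as set up in the
                 * preallocation loop above.
                 */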

                /* Use the old key, but set the new significant
                 * bit to zero.
                 */

                left = (struct tnode *) tnode_get_child(tn, 2*i);
                put_child(tn, 2*i, NULL);

                BUG_ON(!left);

                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
                put_child(tn, 2*i+1, NULL);

                BUG_ON(!right);

                size = tnode_child_length(left);
                for (j = 0; j < size; j++) {
                        put_child(left, j, rtnl_dereference(inode->child[j]));
                        put_child(right, j, rtnl_dereference(inode->child[j + size]));
                }
                put_child(tn, 2*i, resize(t, left));
                put_child(tn, 2*i+1, resize(t, right));

                tnode_free_safe(inode);
        }
        tnode_free_safe(oldtnode);
        return tn;
nomem:
        tnode_clean_free(tn);
        return ERR_PTR(-ENOMEM);
}

static struct tnode *halve(struct trie *t, struct tnode *tn)
{
        struct tnode *oldtnode = tn;
        struct rt_trie_node *left, *right;
        int i;
        int olen = tnode_child_length(tn);

        pr_debug("In halve\n");

        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

        if (!tn)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate and store tnodes before the actual work so we
         * don't get into an inconsistent state if memory allocation
         * fails. In case of failure we return the old node and the
         * halve operation is abandoned.
         */

        for (i = 0; i < olen; i += 2) {
                left = tnode_get_child(oldtnode, i);
                right = tnode_get_child(oldtnode, i+1);

                /* Two nonempty children */
                if (left && right) {
                        struct tnode *newn;

                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);

                        if (!newn)
                                goto nomem;

                        put_child(tn, i/2, (struct rt_trie_node *)newn);
                }

        }

        for (i = 0; i < olen; i += 2) {
                struct tnode *newBinNode;

                left = tnode_get_child(oldtnode, i);
                right = tnode_get_child(oldtnode, i+1);

                /* At least one of the children is empty */
                if (left == NULL) {
                        if (right == NULL)    /* Both are empty */
                                continue;
                        put_child(tn, i/2, right);
                        continue;
                }

                if (right == NULL) {
                        put_child(tn, i/2, left);
                        continue;
                }

                /* Two nonempty children */
                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
                put_child(tn, i/2, NULL);
                put_child(newBinNode, 0, left);
                put_child(newBinNode, 1, right);
                put_child(tn, i/2, resize(t, newBinNode));
        }
        tnode_free_safe(oldtnode);
        return tn;
nomem:
        tnode_clean_free(tn);
        return ERR_PTR(-ENOMEM);
}

/* The read side must use rcu_read_lock(); currently this applies to the
 * dump routines reached via get_fa_head() and dump.
 */

static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
        struct hlist_head *head = &l->list;
        struct leaf_info *li;

        hlist_for_each_entry_rcu(li, head, hlist)
                if (li->plen == plen)
                        return li;

        return NULL;
}

static inline struct list_head *get_fa_head(struct leaf *l, int plen)
{
        struct leaf_info *li = find_leaf_info(l, plen);

        if (!li)
                return NULL;

        return &li->falh;
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
        struct leaf_info *li = NULL, *last = NULL;

        if (hlist_empty(head)) {
                hlist_add_head_rcu(&new->hlist, head);
        } else {
                hlist_for_each_entry(li, head, hlist) {
                        if (new->plen > li->plen)
                                break;

                        last = li;
                }
                if (last)
                        hlist_add_after_rcu(&last->hlist, &new->hlist);
                else
                        hlist_add_before_rcu(&new->hlist, &li->hlist);
        }
}

/* rcu_read_lock must be held by the caller on the read side */

static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
        int pos;
        struct tnode *tn;
        struct rt_trie_node *n;

        pos = 0;
        n = rcu_dereference_rtnl(t->trie);

        while (n != NULL && NODE_TYPE(n) == T_TNODE) {
                tn = (struct tnode *) n;

                check_tnode(tn);

                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
                        pos = tn->pos + tn->bits;
                        n = tnode_get_child_rcu(tn,
                                                tkey_extract_bits(key,
                                                                  tn->pos,
                                                                  tn->bits));
                } else
                        break;
        }
        /* Case we have found a leaf. Compare prefixes */

        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
                return (struct leaf *)n;

        return NULL;
}

static void trie_rebalance(struct trie *t, struct tnode *tn)
{
        int wasfull;
        t_key cindex, key;
        struct tnode *tp;

        key = tn->key;

        while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
                tn = (struct tnode *)resize(t, tn);

                tnode_put_child_reorg(tp, cindex,
                                      (struct rt_trie_node *)tn, wasfull);

                tp = node_parent((struct rt_trie_node *) tn);
                if (!tp)
                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);

                tnode_free_flush();
                if (!tp)
                        break;
                tn = tp;
        }

        /* Handle last (top) tnode */
        if (IS_TNODE(tn))
                tn = (struct tnode *)resize(t, tn);

        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
        tnode_free_flush();
}

/* only used from updater-side */

static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
        int pos, newpos;
        struct tnode *tp = NULL, *tn = NULL;
        struct rt_trie_node *n;
        struct leaf *l;
        int missbit;
        struct list_head *fa_head = NULL;
        struct leaf_info *li;
        t_key cindex;

        pos = 0;
        n = rtnl_dereference(t->trie);

        /* If we point to NULL, stop. Either the tree is empty and we should
         * just put a new leaf in it, or we have reached an empty child slot,
         * and we should just put our new leaf in that.
         * If we point to a T_TNODE, check if it matches our key. Note that
         * a T_TNODE might be skipping any number of bits - its 'pos' need
         * not be the parent's 'pos'+'bits'!
         *
         * If it does match the current key, get pos/bits from it, extract
         * the index from our key, push the T_TNODE and walk the tree.
         *
         * If it doesn't, we have to replace it with a new T_TNODE.
         *
         * If we point to a T_LEAF, it might or might not have the same key
         * as we do. If it does, just update the T_LEAF's value and
         * return it.
         * If it doesn't, we need to replace it with a T_TNODE.
         */

        while (n != NULL && NODE_TYPE(n) == T_TNODE) {
                tn = (struct tnode *) n;

                check_tnode(tn);

                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
                        tp = tn;
                        pos = tn->pos + tn->bits;
                        n = tnode_get_child(tn,
                                            tkey_extract_bits(key,
                                                              tn->pos,
                                                              tn->bits));

                        BUG_ON(n && node_parent(n) != tn);
                } else
                        break;
        }

        /*
         * n ----> NULL, LEAF or TNODE
         *
         * tp is n's (parent) ----> NULL or TNODE
         */

        BUG_ON(tp && IS_LEAF(tp));

        /* Case 1: n is a leaf. Compare prefixes */

        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
                l = (struct leaf *) n;
                li = leaf_info_new(plen);

                if (!li)
                        return NULL;

                fa_head = &li->falh;
                insert_leaf_info(&l->list, li);
                goto done;
        }
        l = leaf_new();

        if (!l)
                return NULL;

        l->key = key;
        li = leaf_info_new(plen);

        if (!li) {
                free_leaf(l);
                return NULL;
        }

        fa_head = &li->falh;
        insert_leaf_info(&l->list, li);

        if (t->trie && n == NULL) {
                /* Case 2: n is NULL, and we will just insert a new leaf */

                node_set_parent((struct rt_trie_node *)l, tp);

                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
                put_child(tp, cindex, (struct rt_trie_node *)l);
        } else {
                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
                /*
                 * Add a new tnode here;
                 * the first tnode needs some special handling.
                 */

                if (n) {
                        pos = tp ? tp->pos+tp->bits : 0;
                        newpos = tkey_mismatch(key, pos, n->key);
                        tn = tnode_new(n->key, newpos, 1);
                } else {
                        newpos = 0;
                        tn = tnode_new(key, newpos, 1); /* First tnode */
                }

                if (!tn) {
                        free_leaf_info(li);
                        free_leaf(l);
                        return NULL;
                }

                node_set_parent((struct rt_trie_node *)tn, tp);

                missbit = tkey_extract_bits(key, newpos, 1);
                put_child(tn, missbit, (struct rt_trie_node *)l);
                put_child(tn, 1-missbit, n);

                if (tp) {
                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
                        put_child(tp, cindex, (struct rt_trie_node *)tn);
                } else {
                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
                        tp = tn;
                }
        }

        if (tp && tp->pos + tp->bits > 32)
                pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
                        tp, tp->pos, tp->bits, key, plen);

        /* Rebalance the trie */

        trie_rebalance(t, tp);
done:
        return fa_head;
}

/*
 * Caller must hold RTNL.
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
        struct trie *t = (struct trie *) tb->tb_data;
        struct fib_alias *fa, *new_fa;
        struct list_head *fa_head = NULL;
        struct fib_info *fi;
        int plen = cfg->fc_dst_len;
        u8 tos = cfg->fc_tos;
        u32 key, mask;
        int err;
        struct leaf *l;

        if (plen > 32)
                return -EINVAL;

        key = ntohl(cfg->fc_dst);

        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

        mask = ntohl(inet_make_mask(plen));

        if (key & ~mask)
                return -EINVAL;

        key = key & mask;

        fi = fib_create_info(cfg);
        if (IS_ERR(fi)) {
                err = PTR_ERR(fi);
                goto err;
        }

        l = fib_find_node(t, key);
        fa = NULL;

        if (l) {
                fa_head = get_fa_head(l, plen);
                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
        }

        /* Now fa, if non-NULL, points to the first fib alias
         * with the same keys [prefix,tos,priority], if such a key already
         * exists, or to the node before which we will insert the new one.
         *
         * If fa is NULL, we will need to allocate a new one and
         * insert it at the head of fa_head.
         *
         * If fa_head is NULL, no fib node matched the destination key
         * and we need to allocate a new one of those as well.
         */

        if (fa && fa->fa_tos == tos &&
            fa->fa_info->fib_priority == fi->fib_priority) {
                struct fib_alias *fa_first, *fa_match;

                err = -EEXIST;
                if (cfg->fc_nlflags & NLM_F_EXCL)
                        goto out;

                /* We have 2 goals:
                 * 1. Find exact match for type, scope, fib_info to avoid
                 * duplicate routes
                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
                 */
                fa_match = NULL;
                fa_first = fa;
                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
                list_for_each_entry_continue(fa, fa_head, fa_list) {
                        if (fa->fa_tos != tos)
                                break;
                        if (fa->fa_info->fib_priority != fi->fib_priority)
                                break;
                        if (fa->fa_type == cfg->fc_type &&
                            fa->fa_info == fi) {
                                fa_match = fa;
                                break;
                        }
                }

                if (cfg->fc_nlflags & NLM_F_REPLACE) {
                        struct fib_info *fi_drop;
                        u8 state;

                        fa = fa_first;
                        if (fa_match) {
                                if (fa == fa_match)
                                        err = 0;
                                goto out;
                        }
                        err = -ENOBUFS;
                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
                        if (new_fa == NULL)
                                goto out;

                        fi_drop = fa->fa_info;
                        new_fa->fa_tos = fa->fa_tos;
                        new_fa->fa_info = fi;
                        new_fa->fa_type = cfg->fc_type;
                        state = fa->fa_state;
                        new_fa->fa_state = state & ~FA_S_ACCESSED;

                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
                        alias_free_mem_rcu(fa);

                        fib_release_info(fi_drop);
                        if (state & FA_S_ACCESSED)
                                rt_cache_flush(cfg->fc_nlinfo.nl_net);
                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
                                  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

                        goto succeeded;
                }
                /* Error if we find a perfect match which
                 * uses the same scope, type, and nexthop
                 * information.
                 */
                if (fa_match)
                        goto out;

                if (!(cfg->fc_nlflags & NLM_F_APPEND))
                        fa = fa_first;
        }
        err = -ENOENT;
        if (!(cfg->fc_nlflags & NLM_F_CREATE))
                goto out;

        err = -ENOBUFS;
        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
        if (new_fa == NULL)
                goto out;

        new_fa->fa_info = fi;
        new_fa->fa_tos = tos;
        new_fa->fa_type = cfg->fc_type;
        new_fa->fa_state = 0;
        /*
         * Insert the new entry into the list.
         */

        if (!fa_head) {
                fa_head = fib_insert_node(t, key, plen);
                if (unlikely(!fa_head)) {
                        err = -ENOMEM;
                        goto out_free_new_fa;
                }
        }

        if (!plen)
                tb->tb_num_default++;

        list_add_tail_rcu(&new_fa->fa_list,
                          (fa ? &fa->fa_list : fa_head));

        rt_cache_flush(cfg->fc_nlinfo.nl_net);
        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
                  &cfg->fc_nlinfo, 0);
succeeded:
        return 0;

out_free_new_fa:
        kmem_cache_free(fn_alias_kmem, new_fa);
out:
        fib_release_info(fi);
err:
        return err;
}

/* should be called with rcu_read_lock */
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
                      t_key key, const struct flowi4 *flp,
                      struct fib_result *res, int fib_flags)
{
        struct leaf_info *li;
        struct hlist_head *hhead = &l->list;

        hlist_for_each_entry_rcu(li, hhead, hlist) {
                struct fib_alias *fa;

                if (l->key != (key & li->mask_plen))
                        continue;

                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
                        struct fib_info *fi = fa->fa_info;
                        int nhsel, err;

                        if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
                                continue;
                        if (fi->fib_dead)
                                continue;
                        if (fa->fa_info->fib_scope < flp->flowi4_scope)
                                continue;
                        fib_alias_accessed(fa);
                        err = fib_props[fa->fa_type].error;
                        if (err) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
                                t->stats.semantic_match_passed++;
#endif
                                return err;
                        }
                        if (fi->fib_flags & RTNH_F_DEAD)
                                continue;
                        for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
                                const struct fib_nh *nh = &fi->fib_nh[nhsel];

                                if (nh->nh_flags & RTNH_F_DEAD)
                                        continue;
                                if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
                                        continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
                                t->stats.semantic_match_passed++;
#endif
                                res->prefixlen = li->plen;
                                res->nh_sel = nhsel;
                                res->type = fa->fa_type;
                                res->scope = fa->fa_info->fib_scope;
                                res->fi = fi;
                                res->table = tb;
                                res->fa_head = &li->falh;
                                if (!(fib_flags & FIB_LOOKUP_NOREF))
                                        atomic_inc(&fi->fib_clntref);
                                return 0;
                        }
                }

#ifdef CONFIG_IP_FIB_TRIE_STATS
                t->stats.semantic_match_miss++;
#endif
        }

        return 1;
}

int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
                     struct fib_result *res, int fib_flags)
{
        struct trie *t = (struct trie *) tb->tb_data;
        int ret;
        struct rt_trie_node *n;
        struct tnode *pn;
        unsigned int pos, bits;
        t_key key = ntohl(flp->daddr);
        unsigned int chopped_off;
        t_key cindex = 0;
        unsigned int current_prefix_length = KEYLENGTH;
        struct tnode *cn;
        t_key pref_mismatch;

        rcu_read_lock();

        n = rcu_dereference(t->trie);
        if (!n)
                goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
        t->stats.gets++;
#endif

        /* Just a leaf? */
        if (IS_LEAF(n)) {
                ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
                goto found;
        }

        pn = (struct tnode *) n;
        chopped_off = 0;

        while (pn) {
                pos = pn->pos;
                bits = pn->bits;

                if (!chopped_off)
                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
                                                   pos, bits);

                n = tnode_get_child_rcu(pn, cindex);

                if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
                        t->stats.null_node_hit++;
#endif
                        goto backtrace;
                }

                if (IS_LEAF(n)) {
                        ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
                        if (ret > 0)
                                goto backtrace;
                        goto found;
                }

                cn = (struct tnode *)n;

                /*
                 * It's a tnode, and we can do some extra checks here if we
                 * like, to avoid descending into a dead-end branch.
                 * This tnode is in the parent's child array at index
                 * key[p_pos..p_pos+p_bits] but potentially with some bits
                 * chopped off, so in reality the index may be just a
                 * subprefix, padded with zero at the end.
                 * We can also take a look at any skipped bits in this
                 * tnode - everything up to p_pos is supposed to be ok,
                 * and the non-chopped bits of the index (see the previous
                 * paragraph) are also guaranteed ok, but the rest is
                 * considered unknown.
                 *
                 * The skipped bits are key[pos+bits..cn->pos].
                 */

                /* If current_prefix_length < pos+bits, we are already doing
                 * actual prefix matching, which means everything from
                 * pos+(bits-chopped_off) onward must be zero along some
                 * branch of this subtree - otherwise there is *no* valid
                 * prefix present. Here we can only check the skipped
                 * bits. Remember, since we have already indexed into the
                 * parent's child array, we know that the bits we chopped off
                 * *are* zero.
                 */

                /* NOTA BENE: Checking only skipped bits for the new node here */

                if (current_prefix_length < pos+bits) {
                        if (tkey_extract_bits(cn->key, current_prefix_length,
                                              cn->pos - current_prefix_length)
                            || !(cn->child[0]))
                                goto backtrace;
                }

                /*
                 * If chopped_off=0, the index is fully validated and we
                 * only need to look at the skipped bits for this, the new,
                 * tnode. What we actually want to do is to find out if
                 * these skipped bits match our key perfectly, or if we will
                 * have to count on finding a matching prefix further down,
                 * because if we do, we would like to have some way of
                 * verifying the existence of such a prefix at this point.
                 */

                /* The only thing we can do at this point is to verify that
                 * any such matching prefix can indeed be a prefix to our
                 * key, and if the bits in the node we are inspecting that
                 * do not match our key are not ZERO, this cannot be true.
                 * Thus, find out where there is a mismatch (before cn->pos)
                 * and verify that all the mismatching bits are zero in the
                 * new tnode's key.
                 */

                /*
                 * Note: We aren't very concerned about the piece of
                 * the key that precedes pn->pos+pn->bits, since these
                 * have already been checked. The bits after cn->pos
                 * aren't checked since these are by definition
                 * "unknown" at this point. Thus, what we want to see
                 * is if we are about to enter the "prefix matching"
                 * state, and in that case verify that the skipped
                 * bits that will prevail throughout this subtree are
                 * zero, as they have to be if we are to find a
                 * matching prefix.
                 */

                pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);

                /*
                 * In short: If the skipped bits in this node do not match
                 * the search key, enter the "prefix matching" state
                 * directly.
                 */
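
                /*
                 * Illustrative example, not from the original source: if
                 * cn->key and key first differ at bit 4 from the MSB,
                 * pref_mismatch has its highest set bit at 2^27, so
                 * __fls() returns 27 and mp = 32 - 27 - 1 = 4, the index
                 * of the first mismatching bit.
                 */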
                if (pref_mismatch) {
                        /* fls(x) = __fls(x) + 1 */
                        int mp = KEYLENGTH - __fls(pref_mismatch) - 1;

                        if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
                                goto backtrace;

                        if (current_prefix_length >= cn->pos)
                                current_prefix_length = mp;
                }

                pn = (struct tnode *)n; /* Descend */
                chopped_off = 0;
                continue;

backtrace:
                chopped_off++;

                /* Chopping off zero bits does not change the child
                 * index (cindex), so skip over them */
                while ((chopped_off <= pn->bits)
                       && !(cindex & (1<<(chopped_off-1))))
                        chopped_off++;

                /* Decrease current_... with bits chopped off */
                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
                        current_prefix_length = pn->pos + pn->bits
                                                - chopped_off;

                /*
                 * Either we do the actual chop off, or, if we have
                 * chopped off all bits in this tnode, walk up to our parent.
                 */

                if (chopped_off <= pn->bits) {
                        cindex &= ~(1 << (chopped_off-1));
                } else {
                        struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
                        if (!parent)
                                goto failed;

                        /* Get Child's index */
                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
                        pn = parent;
                        chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
                        t->stats.backtrack++;
#endif
                        goto backtrace;
                }
        }
failed:
        ret = 1;
found:
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);

/*
 * Remove the leaf and rebalance the trie via its parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
        struct tnode *tp = node_parent((struct rt_trie_node *) l);

        pr_debug("entering trie_leaf_remove(%p)\n", l);

        if (tp) {
                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
                put_child(tp, cindex, NULL);
                trie_rebalance(t, tp);
        } else
                RCU_INIT_POINTER(t->trie, NULL);

        free_leaf(l);
}

/*
 * Caller must hold RTNL.
 */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
        struct trie *t = (struct trie *) tb->tb_data;
        u32 key, mask;
        int plen = cfg->fc_dst_len;
        u8 tos = cfg->fc_tos;
        struct fib_alias *fa, *fa_to_delete;
        struct list_head *fa_head;
        struct leaf *l;
        struct leaf_info *li;

        if (plen > 32)
                return -EINVAL;

        key = ntohl(cfg->fc_dst);
        mask = ntohl(inet_make_mask(plen));

        if (key & ~mask)
                return -EINVAL;

        key = key & mask;
        l = fib_find_node(t, key);

        if (!l)
                return -ESRCH;

        li = find_leaf_info(l, plen);

        if (!li)
                return -ESRCH;

        fa_head = &li->falh;
        fa = fib_find_alias(fa_head, tos, 0);

        if (!fa)
                return -ESRCH;

        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

        fa_to_delete = NULL;
        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
        list_for_each_entry_continue(fa, fa_head, fa_list) {
                struct fib_info *fi = fa->fa_info;

                if (fa->fa_tos != tos)
                        break;

                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
                     fa->fa_info->fib_scope == cfg->fc_scope) &&
                    (!cfg->fc_prefsrc ||
                     fi->fib_prefsrc == cfg->fc_prefsrc) &&
                    (!cfg->fc_protocol ||
                     fi->fib_protocol == cfg->fc_protocol) &&
                    fib_nh_match(cfg, fi) == 0) {
                        fa_to_delete = fa;
                        break;
                }
        }

        if (!fa_to_delete)
                return -ESRCH;

        fa = fa_to_delete;
        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
                  &cfg->fc_nlinfo, 0);

        list_del_rcu(&fa->fa_list);

        if (!plen)
                tb->tb_num_default--;

        if (list_empty(fa_head)) {
                hlist_del_rcu(&li->hlist);
                free_leaf_info(li);
        }

        if (hlist_empty(&l->list))
                trie_leaf_remove(t, l);

        if (fa->fa_state & FA_S_ACCESSED)
                rt_cache_flush(cfg->fc_nlinfo.nl_net);

        fib_release_info(fa->fa_info);
        alias_free_mem_rcu(fa);
        return 0;
}

static int trie_flush_list(struct list_head *head)
{
        struct fib_alias *fa, *fa_node;
        int found = 0;

        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
                struct fib_info *fi = fa->fa_info;

                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
                        list_del_rcu(&fa->fa_list);
                        fib_release_info(fa->fa_info);
                        alias_free_mem_rcu(fa);
                        found++;
                }
        }
        return found;
}

static int trie_flush_leaf(struct leaf *l)
{
        int found = 0;
        struct hlist_head *lih = &l->list;
        struct hlist_node *tmp;
        struct leaf_info *li = NULL;

        hlist_for_each_entry_safe(li, tmp, lih, hlist) {
                found += trie_flush_list(&li->falh);

                if (list_empty(&li->falh)) {
                        hlist_del_rcu(&li->hlist);
                        free_leaf_info(li);
                }
        }
        return found;
}

/*
 * Scan for the next right leaf starting at node p->child[idx].
 * Since we have back pointers, no recursion is necessary.
 */
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
{
        do {
                t_key idx;

                if (c)
                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
                else
                        idx = 0;

                while (idx < 1u << p->bits) {
                        c = tnode_get_child_rcu(p, idx++);
                        if (!c)
                                continue;

                        if (IS_LEAF(c))
                                return (struct leaf *) c;

                        /* Restart scanning in the new node */
                        p = (struct tnode *) c;
                        idx = 0;
                }

                /* Node empty, walk back up to parent */
                c = (struct rt_trie_node *) p;
        } while ((p = node_parent_rcu(c)) != NULL);

        return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
        struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);

        if (!n)
                return NULL;

        if (IS_LEAF(n)) /* trie is just a leaf */
                return (struct leaf *) n;

        return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
        struct rt_trie_node *c = (struct rt_trie_node *) l;
        struct tnode *p = node_parent_rcu(c);

        if (!p)
                return NULL; /* trie with just one leaf */

        return leaf_walk_rcu(p, c);
}

static struct leaf *trie_leafindex(struct trie *t, int index)
{
        struct leaf *l = trie_firstleaf(t);

        while (l && index-- > 0)
                l = trie_nextleaf(l);

        return l;
}


/*
 * Caller must hold RTNL.
 */
int fib_table_flush(struct fib_table *tb)
{
        struct trie *t = (struct trie *) tb->tb_data;
        struct leaf *l, *ll = NULL;
        int found = 0;

        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
                found += trie_flush_leaf(l);

                if (ll && hlist_empty(&ll->list))
                        trie_leaf_remove(t, ll);
                ll = l;
        }

        if (ll && hlist_empty(&ll->list))
                trie_leaf_remove(t, ll);

        pr_debug("trie_flush found=%d\n", found);
        return found;
}

void fib_free_table(struct fib_table *tb)
{
        kfree(tb);
}

static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
                           struct fib_table *tb,
                           struct sk_buff *skb, struct netlink_callback *cb)
{
        int i, s_i;
        struct fib_alias *fa;
        __be32 xkey = htonl(key);

        s_i = cb->args[5];
        i = 0;

        /* rcu_read_lock is held by the caller */

        list_for_each_entry_rcu(fa, fah, fa_list) {
                if (i < s_i) {
                        i++;
                        continue;
                }

                if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
                                  cb->nlh->nlmsg_seq,
                                  RTM_NEWROUTE,
                                  tb->tb_id,
                                  fa->fa_type,
                                  xkey,
                                  plen,
                                  fa->fa_tos,
                                  fa->fa_info, NLM_F_MULTI) < 0) {
                        cb->args[5] = i;
                        return -1;
                }
                i++;
        }
        cb->args[5] = i;
        return skb->len;
}

static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
                             struct sk_buff *skb, struct netlink_callback *cb)
{
        struct leaf_info *li;
        int i, s_i;

        s_i = cb->args[4];
        i = 0;

        /* rcu_read_lock is held by the caller */
        hlist_for_each_entry_rcu(li, &l->list, hlist) {
                if (i < s_i) {
                        i++;
                        continue;
                }

                if (i > s_i)
                        cb->args[5] = 0;

                if (list_empty(&li->falh))
                        continue;

                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
                        cb->args[4] = i;
                        return -1;
                }
                i++;
        }

        cb->args[4] = i;
        return skb->len;
}

int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
                   struct netlink_callback *cb)
{
        struct leaf *l;
        struct trie *t = (struct trie *) tb->tb_data;
        t_key key = cb->args[2];
        int count = cb->args[3];

        rcu_read_lock();
        /* Dump starting at last key.
         * Note: 0.0.0.0/0 (i.e. the default route) is the first key.
         */
        if (count == 0)
                l = trie_firstleaf(t);
        else {
                /* Normally, continue from the last key, but if that is
                 * missing fall back to a slow rescan.
                 */
                l = fib_find_node(t, key);
                if (!l)
                        l = trie_leafindex(t, count);
        }

        while (l) {
                cb->args[2] = l->key;
                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
                        cb->args[3] = count;
                        rcu_read_unlock();
                        return -1;
                }

                ++count;
                l = trie_nextleaf(l);
                memset(&cb->args[4], 0,
                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
        }
        cb->args[3] = count;
        rcu_read_unlock();

        return skb->len;
}

void __init fib_trie_init(void)
{
        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
                                          sizeof(struct fib_alias),
                                          0, SLAB_PANIC, NULL);

        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
                                           max(sizeof(struct leaf),
                                               sizeof(struct leaf_info)),
                                           0, SLAB_PANIC, NULL);
}


struct fib_table *fib_trie_table(u32 id)
{
        struct fib_table *tb;
        struct trie *t;

        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
                     GFP_KERNEL);
        if (tb == NULL)
                return NULL;

        tb->tb_id = id;
        tb->tb_default = -1;
        tb->tb_num_default = 0;

        t = (struct trie *) tb->tb_data;
        memset(t, 0, sizeof(*t));

        return tb;
}

#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
        struct seq_net_private p;
        struct fib_table *tb;
        struct tnode *tnode;
        unsigned int index;
        unsigned int depth;
};

static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
{
        struct tnode *tn = iter->tnode;
        unsigned int cindex = iter->index;
        struct tnode *p;

        /* A single entry routing table */
        if (!tn)
                return NULL;

        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
                 iter->tnode, iter->index, iter->depth);
rescan:
        while (cindex < (1<<tn->bits)) {
                struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);

                if (n) {
                        if (IS_LEAF(n)) {
                                iter->tnode = tn;
                                iter->index = cindex + 1;
                        } else {
                                /* push down one level */
                                iter->tnode = (struct tnode *) n;
                                iter->index = 0;
                                ++iter->depth;
                        }
                        return n;
                }

                ++cindex;
        }

        /* Current node exhausted, pop back up */
        p = node_parent_rcu((struct rt_trie_node *)tn);
        if (p) {
                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
                tn = p;
                --iter->depth;
                goto rescan;
        }

        /* got root? */
        return NULL;
}

static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
                                               struct trie *t)
{
        struct rt_trie_node *n;

        if (!t)
                return NULL;

        n = rcu_dereference(t->trie);
        if (!n)
                return NULL;

        if (IS_TNODE(n)) {
                iter->tnode = (struct tnode *) n;
                iter->index = 0;
                iter->depth = 1;
        } else {
                iter->tnode = NULL;
                iter->index = 0;
                iter->depth = 0;
        }

        return n;
}

static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
        struct rt_trie_node *n;
        struct fib_trie_iter iter;

        memset(s, 0, sizeof(*s));

        rcu_read_lock();
        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
                if (IS_LEAF(n)) {
                        struct leaf *l = (struct leaf *)n;
                        struct leaf_info *li;

                        s->leaves++;
                        s->totdepth += iter.depth;
                        if (iter.depth > s->maxdepth)
                                s->maxdepth = iter.depth;

                        hlist_for_each_entry_rcu(li, &l->list, hlist)
                                ++s->prefixes;
                } else {
                        const struct tnode *tn = (const struct tnode *) n;
                        int i;

                        s->tnodes++;
                        if (tn->bits < MAX_STAT_DEPTH)
                                s->nodesizes[tn->bits]++;

                        for (i = 0; i < (1<<tn->bits); i++)
                                if (!tn->child[i])
                                        s->nullpointers++;
                }
        }
        rcu_read_unlock();
}

/*
 * This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
        unsigned int i, max, pointers, bytes, avdepth;

        if (stat->leaves)
                avdepth = stat->totdepth*100 / stat->leaves;
        else
                avdepth = 0;

        seq_printf(seq, "\tAver depth: %u.%02d\n",
                   avdepth / 100, avdepth % 100);
        seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);

        seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
        bytes = sizeof(struct leaf) * stat->leaves;

        seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
        bytes += sizeof(struct leaf_info) * stat->prefixes;

        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
        bytes += sizeof(struct tnode) * stat->tnodes;

        max = MAX_STAT_DEPTH;
        while (max > 0 && stat->nodesizes[max-1] == 0)
                max--;

        pointers = 0;
        for (i = 1; i < max; i++)
                if (stat->nodesizes[i] != 0) {
                        seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
                        pointers += (1<<i) * stat->nodesizes[i];
                }
        seq_putc(seq, '\n');
        seq_printf(seq, "\tPointers: %u\n", pointers);

        bytes += sizeof(struct rt_trie_node *) * pointers;
        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
        seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
2138
2139#ifdef CONFIG_IP_FIB_TRIE_STATS
2140static void trie_show_usage(struct seq_file *seq,
2141 const struct trie_use_stats *stats)
2142{
2143 seq_printf(seq, "\nCounters:\n---------\n");
2144 seq_printf(seq, "gets = %u\n", stats->gets);
2145 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2146 seq_printf(seq, "semantic match passed = %u\n",
2147 stats->semantic_match_passed);
2148 seq_printf(seq, "semantic match miss = %u\n",
2149 stats->semantic_match_miss);
2150 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2151 seq_printf(seq, "skipped node resize = %u\n\n",
2152 stats->resize_node_skipped);
2153}
2154#endif /* CONFIG_IP_FIB_TRIE_STATS */
2155
2156static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2157{
2158 if (tb->tb_id == RT_TABLE_LOCAL)
2159 seq_puts(seq, "Local:\n");
2160 else if (tb->tb_id == RT_TABLE_MAIN)
2161 seq_puts(seq, "Main:\n");
2162 else
2163 seq_printf(seq, "Id %d:\n", tb->tb_id);
2164}
2165
2166
2167static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2168{
2169 struct net *net = (struct net *)seq->private;
2170 unsigned int h;
2171
2172 seq_printf(seq,
2173 "Basic info: size of leaf:"
2174 " %Zd bytes, size of tnode: %Zd bytes.\n",
2175 sizeof(struct leaf), sizeof(struct tnode));
2176
2177 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2178 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2179 struct fib_table *tb;
2180
2181 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2182 struct trie *t = (struct trie *) tb->tb_data;
2183 struct trie_stat stat;
2184
2185 if (!t)
2186 continue;
2187
2188 fib_table_print(seq, tb);
2189
2190 trie_collect_stats(t, &stat);
2191 trie_show_stats(seq, &stat);
2192#ifdef CONFIG_IP_FIB_TRIE_STATS
2193 trie_show_usage(seq, &t->stats);
2194#endif
2195 }
2196 }
2197
2198 return 0;
2199}
2200
2201static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2202{
2203 return single_open_net(inode, file, fib_triestat_seq_show);
2204}
2205
2206static const struct file_operations fib_triestat_fops = {
2207 .owner = THIS_MODULE,
2208 .open = fib_triestat_seq_open,
2209 .read = seq_read,
2210 .llseek = seq_lseek,
2211 .release = single_release_net,
2212};
2213
2214static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2215{
2216 struct fib_trie_iter *iter = seq->private;
2217 struct net *net = seq_file_net(seq);
2218 loff_t idx = 0;
2219 unsigned int h;
2220
2221 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2222 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2223 struct fib_table *tb;
2224
2225 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2226 struct rt_trie_node *n;
2227
2228 for (n = fib_trie_get_first(iter,
2229 (struct trie *) tb->tb_data);
2230 n; n = fib_trie_get_next(iter))
2231 if (pos == idx++) {
2232 iter->tb = tb;
2233 return n;
2234 }
2235 }
2236 }
2237
2238 return NULL;
2239}
2240
2241static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2242 __acquires(RCU)
2243{
2244 rcu_read_lock();
2245 return fib_trie_get_idx(seq, *pos);
2246}
2247
2248static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2249{
2250 struct fib_trie_iter *iter = seq->private;
2251 struct net *net = seq_file_net(seq);
2252 struct fib_table *tb = iter->tb;
2253 struct hlist_node *tb_node;
2254 unsigned int h;
2255 struct rt_trie_node *n;
2256
2257 ++*pos;
2258 /* next node in same table */
2259 n = fib_trie_get_next(iter);
2260 if (n)
2261 return n;
2262
2263 /* walk rest of this hash chain */
2264 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2265 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2266 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2267 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2268 if (n)
2269 goto found;
2270 }
2271
2272 /* new hash chain */
2273 while (++h < FIB_TABLE_HASHSZ) {
2274 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2275 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2276 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2277 if (n)
2278 goto found;
2279 }
2280 }
2281 return NULL;
2282
2283found:
2284 iter->tb = tb;
2285 return n;
2286}
2287
2288static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2289 __releases(RCU)
2290{
2291 rcu_read_unlock();
2292}
2293
2294static void seq_indent(struct seq_file *seq, int n)
2295{
2296 while (n-- > 0)
2297 seq_puts(seq, " ");
2298}
2299
2300static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2301{
2302 switch (s) {
2303 case RT_SCOPE_UNIVERSE: return "universe";
2304 case RT_SCOPE_SITE: return "site";
2305 case RT_SCOPE_LINK: return "link";
2306 case RT_SCOPE_HOST: return "host";
2307 case RT_SCOPE_NOWHERE: return "nowhere";
2308 default:
2309 snprintf(buf, len, "scope=%d", s);
2310 return buf;
2311 }
2312}
2313
2314static const char *const rtn_type_names[__RTN_MAX] = {
2315 [RTN_UNSPEC] = "UNSPEC",
2316 [RTN_UNICAST] = "UNICAST",
2317 [RTN_LOCAL] = "LOCAL",
2318 [RTN_BROADCAST] = "BROADCAST",
2319 [RTN_ANYCAST] = "ANYCAST",
2320 [RTN_MULTICAST] = "MULTICAST",
2321 [RTN_BLACKHOLE] = "BLACKHOLE",
2322 [RTN_UNREACHABLE] = "UNREACHABLE",
2323 [RTN_PROHIBIT] = "PROHIBIT",
2324 [RTN_THROW] = "THROW",
2325 [RTN_NAT] = "NAT",
2326 [RTN_XRESOLVE] = "XRESOLVE",
2327};
2328
2329static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2330{
2331 if (t < __RTN_MAX && rtn_type_names[t])
2332 return rtn_type_names[t];
2333 snprintf(buf, len, "type %u", t);
2334 return buf;
2335}
2336
2337/* Pretty print the trie */
2338static int fib_trie_seq_show(struct seq_file *seq, void *v)
2339{
2340 const struct fib_trie_iter *iter = seq->private;
2341 struct rt_trie_node *n = v;
2342
2343 if (!node_parent_rcu(n))
2344 fib_table_print(seq, iter->tb);
2345
2346 if (IS_TNODE(n)) {
2347 struct tnode *tn = (struct tnode *) n;
2348 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2349
2350 seq_indent(seq, iter->depth-1);
2351 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2352 &prf, tn->pos, tn->bits, tn->full_children,
2353 tn->empty_children);
2354
2355 } else {
2356 struct leaf *l = (struct leaf *) n;
2357 struct leaf_info *li;
2358 __be32 val = htonl(l->key);
2359
2360 seq_indent(seq, iter->depth);
2361 seq_printf(seq, " |-- %pI4\n", &val);
2362
2363 hlist_for_each_entry_rcu(li, &l->list, hlist) {
2364 struct fib_alias *fa;
2365
2366 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2367 char buf1[32], buf2[32];
2368
2369 seq_indent(seq, iter->depth+1);
2370 seq_printf(seq, " /%d %s %s", li->plen,
2371 rtn_scope(buf1, sizeof(buf1),
2372 fa->fa_info->fib_scope),
2373 rtn_type(buf2, sizeof(buf2),
2374 fa->fa_type));
2375 if (fa->fa_tos)
2376 seq_printf(seq, " tos=%d", fa->fa_tos);
2377 seq_putc(seq, '\n');
2378 }
2379 }
2380 }
2381
2382 return 0;
2383}
2384
2385static const struct seq_operations fib_trie_seq_ops = {
2386 .start = fib_trie_seq_start,
2387 .next = fib_trie_seq_next,
2388 .stop = fib_trie_seq_stop,
2389 .show = fib_trie_seq_show,
2390};
2391
2392static int fib_trie_seq_open(struct inode *inode, struct file *file)
2393{
2394 return seq_open_net(inode, file, &fib_trie_seq_ops,
2395 sizeof(struct fib_trie_iter));
2396}
2397
2398static const struct file_operations fib_trie_fops = {
2399 .owner = THIS_MODULE,
2400 .open = fib_trie_seq_open,
2401 .read = seq_read,
2402 .llseek = seq_lseek,
2403 .release = seq_release_net,
2404};
2405
2406struct fib_route_iter {
2407 struct seq_net_private p;
2408 struct trie *main_trie;
2409 loff_t pos;
2410 t_key key;
2411};
2412
2413static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2414{
2415 struct leaf *l = NULL;
2416 struct trie *t = iter->main_trie;
2417
2418 /* use cache location of last found key */
2419 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2420 pos -= iter->pos;
2421 else {
2422 iter->pos = 0;
2423 l = trie_firstleaf(t);
2424 }
2425
2426 while (l && pos-- > 0) {
2427 iter->pos++;
2428 l = trie_nextleaf(l);
2429 }
2430
2431 if (l)
		iter->key = l->key;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}
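
/* Illustrative note (added commentary, not from the original source):
 * /proc reads arrive in chunks, so the iterator above caches the
 * (pos, key) pair of the last leaf it returned.  A reader that stopped
 * after entry 40 and resumes at pos 41 re-finds the cached leaf by key
 * via fib_find_node() instead of rewalking the first 40 leaves.
 */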

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct leaf *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
{
	unsigned int flags = 0;

	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
	if (mask == htonl(0xFFFFFFFF))
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
}

/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
 *	and needs to be the same as the fib_hash output to avoid
 *	breaking legacy utilities.
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct leaf *l = v;
	struct leaf_info *li;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	hlist_for_each_entry_rcu(li, &l->list, hlist) {
		struct fib_alias *fa;
		__be32 mask, prefix;

		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			const struct fib_info *fi = fa->fa_info;
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);

			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;

			seq_setwidth(seq, 127);

			if (fi)
				seq_printf(seq,
					   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					   "%d\t%08X\t%d\t%u\t%u",
					   fi->fib_dev ? fi->fib_dev->name : "*",
					   prefix,
					   fi->fib_nh->nh_gw, flags, 0, 0,
					   fi->fib_priority,
					   mask,
					   (fi->fib_advmss ?
					    fi->fib_advmss + 40 : 0),
					   fi->fib_window,
					   fi->fib_rtt >> 3);
			else
				seq_printf(seq,
					   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					   "%d\t%08X\t%d\t%u\t%u",
					   prefix, 0, flags, 0, 0, 0,
					   mask, 0, 0, 0);

			seq_pad(seq, '\n');
		}
	}

	return 0;
}
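
/* Illustrative output (an assumed example, not captured from a real
 * system): a default route via 192.168.1.1 on eth0 would be rendered by
 * the code above as
 *
 *   eth0	00000000	0101A8C0	0003	0	0	0	00000000	0	0	0
 *
 * %08X prints the __be32 values as host-order hex words, so on a
 * little-endian machine 192.168.1.1 shows up as 0101A8C0, and the flags
 * word 0003 is RTF_UP | RTF_GATEWAY from fib_flag_trans() above.
 */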

static const struct seq_operations fib_route_seq_ops = {
	.start	= fib_route_seq_start,
	.next	= fib_route_seq_next,
	.stop	= fib_route_seq_stop,
	.show	= fib_route_seq_show,
};

static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			    sizeof(struct fib_route_iter));
}

static const struct file_operations fib_route_fops = {
	.owner	= THIS_MODULE,
	.open	= fib_route_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
	.release = seq_release_net,
};

int __net_init fib_proc_init(struct net *net)
{
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
		goto out1;

	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
		goto out2;

	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
		goto out3;

	return 0;

out3:
	remove_proc_entry("fib_triestat", net->proc_net);
out2:
	remove_proc_entry("fib_trie", net->proc_net);
out1:
	return -ENOMEM;
}

void __net_exit fib_proc_exit(struct net *net)
{
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
}

#endif /* CONFIG_PROC_FS */

#include <linux/cache.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/fib_notifier.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"

static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
				   enum fib_event_type event_type, u32 dst,
				   int dst_len, struct fib_alias *fa)
{
	struct fib_entry_notifier_info info = {
		.dst = dst,
		.dst_len = dst_len,
		.fi = fa->fa_info,
		.tos = fa->fa_tos,
		.type = fa->fa_type,
		.tb_id = fa->tb_id,
	};
	return call_fib4_notifier(nb, net, event_type, &info.info);
}

static int call_fib_entry_notifiers(struct net *net,
				    enum fib_event_type event_type, u32 dst,
				    int dst_len, struct fib_alias *fa,
				    struct netlink_ext_ack *extack)
{
	struct fib_entry_notifier_info info = {
		.info.extack = extack,
		.dst = dst,
		.dst_len = dst_len,
		.fi = fa->fa_info,
		.tos = fa->fa_tos,
		.type = fa->fa_type,
		.tb_id = fa->tb_id,
	};
	return call_fib4_notifiers(net, event_type, &info.info);
}

#define MAX_STAT_DEPTH 32

#define KEYLENGTH	(8*sizeof(t_key))
#define KEY_MAX		((t_key)~0)

typedef unsigned int t_key;

#define IS_TRIE(n)	((n)->pos >= KEYLENGTH)
#define IS_TNODE(n)	((n)->bits)
#define IS_LEAF(n)	(!(n)->bits)

struct key_vector {
	t_key key;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char slen;
	union {
		/* This list pointer is valid if (pos | bits) == 0 (LEAF) */
		struct hlist_head leaf;
		/* This array is valid if (pos | bits) > 0 (TNODE) */
		struct key_vector __rcu *tnode[0];
	};
};

struct tnode {
	struct rcu_head rcu;
	t_key empty_children;		/* KEYLENGTH bits needed */
	t_key full_children;		/* KEYLENGTH bits needed */
	struct key_vector __rcu *parent;
	struct key_vector kv[1];
#define tn_bits kv[0].bits
};

#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
#define LEAF_SIZE	TNODE_SIZE(1)

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};

struct trie {
	struct key_vector kv[1];
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats;
#endif
};

static struct key_vector *resize(struct trie *t, struct key_vector *tn);
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;

static struct kmem_cache *fn_alias_kmem __ro_after_init;
static struct kmem_cache *trie_leaf_kmem __ro_after_init;

static inline struct tnode *tn_info(struct key_vector *kv)
{
	return container_of(kv, struct tnode, kv[0]);
}

/* caller must hold RTNL */
#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])

/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])

/* wrapper for rcu_assign_pointer */
static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
{
	if (n)
		rcu_assign_pointer(tn_info(n)->parent, tp);
}

#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)

/* This provides us with the number of children in this node, in the case of a
 * leaf this will return 0 meaning none of the children are accessible.
 */
static inline unsigned long child_length(const struct key_vector *tn)
{
	return (1ul << tn->bits) & ~(1ul);
}
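
/* Illustrative note (added commentary, not in the original source): for a
 * leaf, bits == 0 and (1ul << 0) & ~1ul == 0, so leaves report zero
 * children; for a tnode with bits == 4 the mask leaves the value 16
 * untouched.  The & ~1ul only exists to fold the leaf case into the same
 * expression.
 */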

#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)

static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
	unsigned long index = key ^ kv->key;

	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
		return 0;

	return index >> kv->pos;
}

/* To understand this stuff, an understanding of keys and all their bits is
 * necessary. Every node in the trie has a key associated with it, but not
 * all of the bits in that key are significant.
 *
 * Consider a node 'n' and its parent 'tp'.
 *
 * If n is a leaf, every bit in its key is significant. Its presence is
 * necessitated by path compression, since during a tree traversal (when
 * searching for a leaf - unless we are doing an insertion) we will completely
 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
 * a potentially successful search, that we have indeed been walking the
 * correct key path.
 *
 * Note that we can never "miss" the correct key in the tree if present by
 * following the wrong path. Path compression ensures that segments of the key
 * that are the same for all keys with a given prefix are skipped, but the
 * skipped part *is* identical for each node in the subtrie below the skipped
 * bit! trie_insert() in this implementation takes care of that.
 *
 * if n is an internal node - a 'tnode' here, the various parts of its key
 * have many different meanings.
 *
 * Example:
 * _________________________________________________________________
 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 * -----------------------------------------------------------------
 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 *
 * _________________________________________________________________
 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 * -----------------------------------------------------------------
 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 *
 * tp->pos = 22
 * tp->bits = 3
 * n->pos = 13
 * n->bits = 4
 *
 * First, let's just ignore the bits that come before the parent tp, that is
 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 * point we do not use them for anything.
 *
 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 * index into the parent's child array. That is, they will be used to find
 * 'n' among tp's children.
 *
 * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
 * for the node n.
 *
 * All the bits we have seen so far are significant to the node n. The rest
 * of the bits are really not needed or indeed known in n->key.
 *
 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 * n's child array, and will of course be different for each child.
 *
 * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
 * at this point.
 */
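
/* A minimal illustrative sketch (an addition, never called by this file):
 * with the example above, tp->pos = 22 and tp->bits = 3, the "N" bits of a
 * key inside tp are recovered exactly as get_cindex()/get_index() do it.
 * Bits above tp->pos + tp->bits cancel in the xor because parent and child
 * agree on them, the shift discards everything below tp->pos, and only the
 * child index (plus any skipped-bit mismatch, which callers check against
 * 1ul << bits) survives.
 */
static inline unsigned long example_child_index(t_key key, t_key parent_key,
						unsigned char parent_pos)
{
	return (unsigned long)(key ^ parent_key) >> parent_pos;
}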

static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;

static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}

static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}

#define TNODE_KMALLOC_MAX \
	ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
#define TNODE_VMALLOC_MAX \
	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))

static void __node_free_rcu(struct rcu_head *head)
{
	struct tnode *n = container_of(head, struct tnode, rcu);

	if (!n->tn_bits)
		kmem_cache_free(trie_leaf_kmem, n);
	else
		kvfree(n);
}

#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)

static struct tnode *tnode_alloc(int bits)
{
	size_t size;

	/* verify bits is within bounds */
	if (bits > TNODE_VMALLOC_MAX)
		return NULL;

	/* determine size and verify it is non-zero and didn't overflow */
	size = TNODE_SIZE(1ul << bits);

	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	else
		return vzalloc(size);
}

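/* Illustrative note (added commentary, not in the original): the counters
 * below are KEYLENGTH bits wide, so a tnode with bits == KEYLENGTH has
 * 2^32 children and its count of empty children cannot be represented
 * directly.  The ?: trick lets empty_children wrap to zero and carries the
 * overflow into full_children, which tnode_new() seeds accordingly.
 */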
static inline void empty_child_inc(struct key_vector *n)
{
	++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
}

static inline void empty_child_dec(struct key_vector *n)
{
	tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
}

static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
	struct key_vector *l;
	struct tnode *kv;

	kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
	if (!kv)
		return NULL;

	/* initialize key vector */
	l = kv->kv;
	l->key = key;
	l->pos = 0;
	l->bits = 0;
	l->slen = fa->fa_slen;

	/* link leaf to fib alias */
	INIT_HLIST_HEAD(&l->leaf);
	hlist_add_head(&fa->fa_list, &l->leaf);

	return l;
}

static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
	unsigned int shift = pos + bits;
	struct key_vector *tn;
	struct tnode *tnode;

	/* verify that bits and pos have their msb bits clear and are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));

	tnode = tnode_alloc(bits);
	if (!tnode)
		return NULL;

	pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
		 sizeof(struct key_vector *) << bits);

	if (bits == KEYLENGTH)
		tnode->full_children = 1;
	else
		tnode->empty_children = 1ul << bits;

	tn = tnode->kv;
	tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
	tn->pos = pos;
	tn->bits = bits;
	tn->slen = pos;

	return tn;
}

/* Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */
static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
{
	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}

/* Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 */
static void put_child(struct key_vector *tn, unsigned long i,
		      struct key_vector *n)
{
	struct key_vector *chi = get_child(tn, i);
	int isfull, wasfull;

	BUG_ON(i >= child_length(tn));

	/* update emptyChildren, overflow into fullChildren */
	if (!n && chi)
		empty_child_inc(tn);
	if (n && !chi)
		empty_child_dec(tn);

	/* update fullChildren */
	wasfull = tnode_full(tn, chi);
	isfull = tnode_full(tn, n);

	if (wasfull && !isfull)
		tn_info(tn)->full_children--;
	else if (!wasfull && isfull)
		tn_info(tn)->full_children++;

	if (n && (tn->slen < n->slen))
		tn->slen = n->slen;

	rcu_assign_pointer(tn->tnode[i], n);
}

static void update_children(struct key_vector *tn)
{
	unsigned long i;

	/* update all of the child parent pointers */
	for (i = child_length(tn); i;) {
		struct key_vector *inode = get_child(tn, --i);

		if (!inode)
			continue;

		/* Either update the children of a tnode that
		 * already belongs to us or update the child
		 * to point to ourselves.
		 */
		if (node_parent(inode) == tn)
			update_children(inode);
		else
			node_set_parent(inode, tn);
	}
}

static inline void put_child_root(struct key_vector *tp, t_key key,
				  struct key_vector *n)
{
	if (IS_TRIE(tp))
		rcu_assign_pointer(tp->tnode[0], n);
	else
		put_child(tp, get_index(key, tp), n);
}

static inline void tnode_free_init(struct key_vector *tn)
{
	tn_info(tn)->rcu.next = NULL;
}

static inline void tnode_free_append(struct key_vector *tn,
				     struct key_vector *n)
{
	tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
	tn_info(tn)->rcu.next = &tn_info(n)->rcu;
}

static void tnode_free(struct key_vector *tn)
{
	struct callback_head *head = &tn_info(tn)->rcu;

	while (head) {
		head = head->next;
		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
		node_free(tn);

		tn = container_of(head, struct tnode, rcu)->kv;
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
}

static struct key_vector *replace(struct trie *t,
				  struct key_vector *oldtnode,
				  struct key_vector *tn)
{
	struct key_vector *tp = node_parent(oldtnode);
	unsigned long i;

	/* setup the parent pointer out of and back into this node */
	NODE_INIT_PARENT(tn, tp);
	put_child_root(tp, tn->key, tn);

	/* update all of the child parent pointers */
	update_children(tn);

	/* all pointers should be clean so we are done */
	tnode_free(oldtnode);

	/* resize children now that oldtnode is freed */
	for (i = child_length(tn); i;) {
		struct key_vector *inode = get_child(tn, --i);

		/* resize child node */
		if (tnode_full(tn, inode))
			tn = resize(t, inode);
	}

	return tp;
}

static struct key_vector *inflate(struct trie *t,
				  struct key_vector *oldtnode)
{
	struct key_vector *tn;
	unsigned long i;
	t_key m;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
	if (!tn)
		goto notnode;

	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

	/* Assemble all of the pointers in our cluster, in this case that
	 * represents all of the pointers out of our allocated nodes that
	 * point to existing tnodes and the links between our allocated
	 * nodes.
	 */
	for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
		struct key_vector *inode = get_child(oldtnode, --i);
		struct key_vector *node0, *node1;
		unsigned long j, k;

		/* An empty child */
		if (!inode)
			continue;

		/* A leaf or an internal node with skipped bits */
		if (!tnode_full(oldtnode, inode)) {
			put_child(tn, get_index(inode->key, tn), inode);
			continue;
		}

		/* drop the node in the old tnode free list */
		tnode_free_append(oldtnode, inode);

		/* An internal node with two children */
		if (inode->bits == 1) {
			put_child(tn, 2 * i + 1, get_child(inode, 1));
			put_child(tn, 2 * i, get_child(inode, 0));
			continue;
		}

		/* We will replace this node 'inode' with two new
		 * ones, 'node0' and 'node1', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * node0's key and "1" in node1's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (tn->pos) - is the one that will differ between
		 * node0 and node1. So... we synthesize that bit in the
		 * two new keys.
		 */
		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
		if (!node1)
			goto nomem;
		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);

		tnode_free_append(tn, node1);
		if (!node0)
			goto nomem;
		tnode_free_append(tn, node0);

		/* populate child pointers in new nodes */
		for (k = child_length(inode), j = k / 2; j;) {
			put_child(node1, --j, get_child(inode, --k));
			put_child(node0, j, get_child(inode, j));
			put_child(node1, --j, get_child(inode, --k));
			put_child(node0, j, get_child(inode, j));
		}

		/* link new nodes to parent */
		NODE_INIT_PARENT(node1, tn);
		NODE_INIT_PARENT(node0, tn);

		/* link parent to nodes */
		put_child(tn, 2 * i + 1, node1);
		put_child(tn, 2 * i, node0);
	}

	/* setup the parent pointers into and out of this node */
	return replace(t, oldtnode, tn);
nomem:
	/* all pointers should be clean so we are done */
	tnode_free(tn);
notnode:
	return NULL;
}

static struct key_vector *halve(struct trie *t,
				struct key_vector *oldtnode)
{
	struct key_vector *tn;
	unsigned long i;

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
	if (!tn)
		goto notnode;

	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

	/* Assemble all of the pointers in our cluster, in this case that
	 * represents all of the pointers out of our allocated nodes that
	 * point to existing tnodes and the links between our allocated
	 * nodes.
	 */
	for (i = child_length(oldtnode); i;) {
		struct key_vector *node1 = get_child(oldtnode, --i);
		struct key_vector *node0 = get_child(oldtnode, --i);
		struct key_vector *inode;

		/* At least one of the children is empty */
		if (!node1 || !node0) {
			put_child(tn, i / 2, node1 ? : node0);
			continue;
		}

		/* Two nonempty children */
		inode = tnode_new(node0->key, oldtnode->pos, 1);
		if (!inode)
			goto nomem;
		tnode_free_append(tn, inode);

		/* initialize pointers out of node */
		put_child(inode, 1, node1);
		put_child(inode, 0, node0);
		NODE_INIT_PARENT(inode, tn);

		/* link parent to node */
		put_child(tn, i / 2, inode);
	}

	/* setup the parent pointers into and out of this node */
	return replace(t, oldtnode, tn);
nomem:
	/* all pointers should be clean so we are done */
	tnode_free(tn);
notnode:
	return NULL;
}

static struct key_vector *collapse(struct trie *t,
				   struct key_vector *oldtnode)
{
	struct key_vector *n, *tp;
	unsigned long i;

	/* scan the tnode looking for that one child that might still exist */
	for (n = NULL, i = child_length(oldtnode); !n && i;)
		n = get_child(oldtnode, --i);

	/* compress one level */
	tp = node_parent(oldtnode);
	put_child_root(tp, oldtnode->key, n);
	node_set_parent(n, tp);

	/* drop dead node */
	node_free(oldtnode);

	return tp;
}

static unsigned char update_suffix(struct key_vector *tn)
{
	unsigned char slen = tn->pos;
	unsigned long stride, i;
	unsigned char slen_max;

	/* only vector 0 can have a suffix length greater than or equal to
	 * tn->pos + tn->bits, the second highest node will have a suffix
	 * length at most of tn->pos + tn->bits - 1
	 */
	slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);

	/* search through the list of children looking for nodes that might
	 * have a suffix greater than the one we currently have.  This is
	 * why we start with a stride of 2 since a stride of 1 would
	 * represent the nodes with suffix length equal to tn->pos
	 */
	for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
		struct key_vector *n = get_child(tn, i);

		if (!n || (n->slen <= slen))
			continue;

		/* update stride and slen based on new value */
		stride <<= (n->slen - slen);
		slen = n->slen;
		i &= ~(stride - 1);

		/* stop searching if we have hit the maximum possible value */
		if (slen >= slen_max)
			break;
	}

	tn->slen = slen;

	return slen;
}

/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 * the Helsinki University of Technology and Matti Tikkanen of Nokia
 * Telecommunications, page 6:
 * "A node is doubled if the ratio of non-empty children to all
 * children in the *doubled* node is at least 'high'."
 *
 * 'high' in this instance is the variable 'inflate_threshold'. It
 * is expressed as a percentage, so we multiply it with
 * child_length() and instead of multiplying by 2 (since the
 * child array will be doubled by inflate()) and multiplying
 * the left-hand side by 100 (to handle the percentage thing) we
 * multiply the left-hand side by 50.
 *
 * The left-hand side may look a bit weird: child_length(tn)
 * - tn->empty_children is of course the number of non-null children
 * in the current node. tn->full_children is the number of "full"
 * children, that is non-null tnodes with a skip value of 0.
 * All of those will be doubled in the resulting inflated tnode, so
 * we just count them one extra time here.
 *
 * A clearer way to write this would be:
 *
 * to_be_doubled = tn->full_children;
 * not_to_be_doubled = child_length(tn) - tn->empty_children -
 *     tn->full_children;
 *
 * new_child_length = child_length(tn) * 2;
 *
 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 *      new_child_length;
 * if (new_fill_factor >= inflate_threshold)
 *
 * ...and so on, tho it would mess up the while () loop.
 *
 * anyway,
 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 *      inflate_threshold
 *
 * avoid a division:
 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 *      inflate_threshold * new_child_length
 *
 * expand not_to_be_doubled and to_be_doubled, and shorten:
 * 100 * (child_length(tn) - tn->empty_children +
 *    tn->full_children) >= inflate_threshold * new_child_length
 *
 * expand new_child_length:
 * 100 * (child_length(tn) - tn->empty_children +
 *    tn->full_children) >=
 *      inflate_threshold * child_length(tn) * 2
 *
 * shorten again:
 * 50 * (tn->full_children + child_length(tn) -
 *    tn->empty_children) >= inflate_threshold *
 *    child_length(tn)
 *
 */
static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
{
	unsigned long used = child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
	threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
	used -= tn_info(tn)->empty_children;
	used += tn_info(tn)->full_children;

	/* if bits == KEYLENGTH then pos = 0, and will fail below */

	return (used > 1) && tn->pos && ((50 * used) >= threshold);
}
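
/* Worked numbers for the derivation above (an illustrative sketch, never
 * called by the kernel code): take a non-root tnode with
 * child_length(tn) = 16, empty_children = 6 and full_children = 4.
 * Then used = 16 - 6 + 4 = 14 and threshold = 16 * 50 = 800, so
 * 50 * used = 700 < 800: the doubled node would only be
 * 100 * (10 + 4) / 32 = 43% full and the node is left alone.
 */
static inline bool example_should_inflate(void)
{
	unsigned long used = 16 - 6 + 4;	/* full children count twice */
	unsigned long threshold = 16 * 50;	/* child_length * inflate_threshold */

	return (50 * used) >= threshold;	/* 700 >= 800 -> false */
}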

static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
{
	unsigned long used = child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
	threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
	used -= tn_info(tn)->empty_children;

	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */

	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
}

static inline bool should_collapse(struct key_vector *tn)
{
	unsigned long used = child_length(tn);

	used -= tn_info(tn)->empty_children;

	/* account for bits == KEYLENGTH case */
	if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
		used -= KEY_MAX;

	/* One child or none, time to drop us from the trie */
	return used < 2;
}

#define MAX_WORK 10
static struct key_vector *resize(struct trie *t, struct key_vector *tn)
{
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	struct key_vector *tp = node_parent(tn);
	unsigned long cindex = get_index(tn->key, tp);
	int max_work = MAX_WORK;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* track the tnode via the pointer from the parent instead of
	 * doing it ourselves.  This way we can let RCU fully do its
	 * thing without us interfering
	 */
	BUG_ON(tn != get_child(tp, cindex));

	/* Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */
	while (should_inflate(tp, tn) && max_work) {
		tp = inflate(t, tn);
		if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->resize_node_skipped);
#endif
			break;
		}

		max_work--;
		tn = get_child(tp, cindex);
	}

	/* update parent in case inflate failed */
	tp = node_parent(tn);

	/* Return if at least one inflate is run */
	if (max_work != MAX_WORK)
		return tp;

	/* Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
	while (should_halve(tp, tn) && max_work) {
		tp = halve(t, tn);
		if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->resize_node_skipped);
#endif
			break;
		}

		max_work--;
		tn = get_child(tp, cindex);
	}

	/* Only one child remains */
	if (should_collapse(tn))
		return collapse(t, tn);

	/* update parent in case halve failed */
	return node_parent(tn);
}

static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
	unsigned char node_slen = tn->slen;

	while ((node_slen > tn->pos) && (node_slen > slen)) {
		slen = update_suffix(tn);
		if (node_slen == slen)
			break;

		tn = node_parent(tn);
		node_slen = tn->slen;
	}
}

static void node_push_suffix(struct key_vector *tn, unsigned char slen)
{
	while (tn->slen < slen) {
		tn->slen = slen;
		tn = node_parent(tn);
	}
}

/* rcu_read_lock needs to be held by the caller on the read side */
static struct key_vector *fib_find_node(struct trie *t,
					struct key_vector **tp, u32 key)
{
	struct key_vector *pn, *n = t->kv;
	unsigned long index = 0;

	do {
		pn = n;
		n = get_child_rcu(n, index);

		if (!n)
			break;

		index = get_cindex(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if (index >= (1ul << bits))
		 *     we have a mismatch in skip bits and failed
		 *   else
		 *     we know the value is cindex
		 *
		 * This check is safe even if bits == KEYLENGTH due to the
		 * fact that we can only allocate a node with 32 bits if a
		 * long is greater than 32 bits.
		 */
		if (index >= (1ul << n->bits)) {
			n = NULL;
			break;
		}

		/* keep searching until we find a perfect match leaf or NULL */
	} while (IS_TNODE(n));

	*tp = pn;

	return n;
}
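
/* Illustrative numbers for the combined check above (added commentary, not
 * part of the original): a node with key 0x80000000, pos = 28 and bits = 2
 * indexes on key bits 28-29.  Searching for 0x40000000 gives
 * (0x40000000 ^ 0x80000000) >> 28 = 0xc, and 0xc >= (1ul << 2), so a
 * mismatch in the skipped bits is detected by the same comparison that
 * would otherwise extract the child index.
 */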

/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
					u8 tos, u32 prio, u32 tb_id)
{
	struct fib_alias *fa;

	if (!fah)
		return NULL;

	hlist_for_each_entry(fa, fah, fa_list) {
		if (fa->fa_slen < slen)
			continue;
		if (fa->fa_slen != slen)
			break;
		if (fa->tb_id > tb_id)
			continue;
		if (fa->tb_id != tb_id)
			break;
		if (fa->fa_tos > tos)
			continue;
		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
			return fa;
	}

	return NULL;
}

static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
	while (!IS_TRIE(tn))
		tn = resize(t, tn);
}

static int fib_insert_node(struct trie *t, struct key_vector *tp,
			   struct fib_alias *new, t_key key)
{
	struct key_vector *n, *l;

	l = leaf_new(key, new);
	if (!l)
		goto noleaf;

	/* retrieve child from parent node */
	n = get_child(tp, get_index(key, tp));

	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
	 *
	 *  Add a new tnode here; the first tnode needs some special
	 *  handling and leaves us in position for handling as case 3.
	 */
	if (n) {
		struct key_vector *tn;

		tn = tnode_new(key, __fls(key ^ n->key), 1);
		if (!tn)
			goto notnode;

		/* initialize routes out of node */
		NODE_INIT_PARENT(tn, tp);
		put_child(tn, get_index(key, tn) ^ 1, n);

		/* start adding routes into the node */
		put_child_root(tp, key, tn);
		node_set_parent(n, tn);

		/* parent now has a NULL spot where the leaf can go */
		tp = tn;
	}

	/* Case 3: n is NULL, and will just insert a new leaf */
	node_push_suffix(tp, new->fa_slen);
	NODE_INIT_PARENT(l, tp);
	put_child_root(tp, key, l);
	trie_rebalance(t, tp);

	return 0;
notnode:
	node_free(l);
noleaf:
	return -ENOMEM;
}

/* The fib notifier for ADD is sent before calling fib_insert_alias with
 * the expectation that the only possible failure is ENOMEM.
 */
static int fib_insert_alias(struct trie *t, struct key_vector *tp,
			    struct key_vector *l, struct fib_alias *new,
			    struct fib_alias *fa, t_key key)
{
	if (!l)
		return fib_insert_node(t, tp, new, key);

	if (fa) {
		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
	} else {
		struct fib_alias *last;

		hlist_for_each_entry(last, &l->leaf, fa_list) {
			if (new->fa_slen < last->fa_slen)
				break;
			if ((new->fa_slen == last->fa_slen) &&
			    (new->tb_id > last->tb_id))
				break;
			fa = last;
		}

		if (fa)
			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
		else
			hlist_add_head_rcu(&new->fa_list, &l->leaf);
	}

	/* if we added to the tail node then we need to update slen */
	if (l->slen < new->fa_slen) {
		l->slen = new->fa_slen;
		node_push_suffix(tp, new->fa_slen);
	}

	return 0;
}

static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
{
	if (plen > KEYLENGTH) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		return false;
	}

	if ((plen < KEYLENGTH) && (key << plen)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid prefix for given prefix length");
		return false;
	}

	return true;
}
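
/* Illustrative sketch (an addition, never called): for plen = 24 the low
 * 8 bits of the host-order key must be clear, so 192.168.1.0 (0xc0a80100)
 * passes while 192.168.1.1 (0xc0a80101) is rejected because key << plen
 * leaves the stray low bits visible as a nonzero value.
 */
static inline bool example_valid_prefix(void)
{
	u32 key = 0xc0a80101;	/* 192.168.1.1 in host byte order */
	u8 plen = 24;

	return !(key << plen);	/* 0x01000000 != 0 -> false, invalid */
}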

/* Caller must hold RTNL. */
int fib_table_insert(struct net *net, struct fib_table *tb,
		     struct fib_config *cfg, struct netlink_ext_ack *extack)
{
	enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
	struct trie *t = (struct trie *)tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct key_vector *l, *tp;
	u16 nlflags = NLM_F_EXCL;
	struct fib_info *fi;
	u8 plen = cfg->fc_dst_len;
	u8 slen = KEYLENGTH - plen;
	u8 tos = cfg->fc_tos;
	u32 key;
	int err;

	key = ntohl(cfg->fc_dst);

	if (!fib_valid_key_len(key, plen, extack))
		return -EINVAL;

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	fi = fib_create_info(cfg, extack);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	l = fib_find_node(t, &tp, key);
	fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
				tb->tb_id) : NULL;

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such a key already
	 * exists, or to the node before which we will insert the new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert it at the tail of the section matching the suffix length
	 * of the new alias.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		nlflags &= ~NLM_F_EXCL;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		hlist_for_each_entry_from(fa, fa_list) {
			if ((fa->fa_slen != slen) ||
			    (fa->tb_id != tb->tb_id) ||
			    (fa->fa_tos != tos))
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			nlflags |= NLM_F_REPLACE;
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (!new_fa)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;
			new_fa->fa_slen = fa->fa_slen;
			new_fa->tb_id = tb->tb_id;
			new_fa->fa_default = -1;

			err = call_fib_entry_notifiers(net,
						       FIB_EVENT_ENTRY_REPLACE,
						       key, plen, new_fa,
						       extack);
			if (err)
				goto out_free_new_fa;

			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				  tb->tb_id, &cfg->fc_nlinfo, nlflags);

			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);

			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		if (cfg->fc_nlflags & NLM_F_APPEND) {
			event = FIB_EVENT_ENTRY_APPEND;
			nlflags |= NLM_F_APPEND;
		} else {
			fa = fa_first;
		}
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	nlflags |= NLM_F_CREATE;
	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (!new_fa)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	new_fa->fa_slen = slen;
	new_fa->tb_id = tb->tb_id;
	new_fa->fa_default = -1;

	err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
	if (err)
		goto out_free_new_fa;

	/* Insert new entry to the list. */
	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
	if (err)
		goto out_fib_notif;

	if (!plen)
		tb->tb_num_default++;

	rt_cache_flush(cfg->fc_nlinfo.nl_net);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
		  &cfg->fc_nlinfo, nlflags);
succeeded:
	return 0;

out_fib_notif:
	/* A notifier was sent announcing that the entry would be added to
	 * the trie, but the add failed and we need to recover.  The only
	 * failure mode for fib_insert_alias is ENOMEM.
	 */
	NL_SET_ERR_MSG(extack, "Failed to insert route into trie");
	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key,
				 plen, new_fa, NULL);
out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}

static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
{
	t_key prefix = n->key;

	return (key ^ prefix) & (prefix | -prefix);
}
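
/* Bit-trick sketch (illustrative only, never called): prefix | -prefix
 * sets every bit from the lowest set bit of the prefix upward, so the
 * mask keeps only differences in the region the prefix actually covers.
 * For the prefix 0x0a000000 (10.0.0.0) and the key 0x0a00002a (10.0.0.42)
 * the xor is 0x2a, entirely below that region, and the result is 0:
 * no mismatch.
 */
static inline t_key example_prefix_mismatch(void)
{
	t_key prefix = 0x0a000000;
	t_key key = 0x0a00002a;

	return (key ^ prefix) & (prefix | -prefix);	/* == 0 */
}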

/* should be called with rcu_read_lock */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	const t_key key = ntohl(flp->daddr);
	struct key_vector *n, *pn;
	struct fib_alias *fa;
	unsigned long index;
	t_key cindex;

	trace_fib_table_lookup(tb->tb_id, flp);

	pn = t->kv;
	cindex = 0;

	n = get_child_rcu(pn, cindex);
	if (!n)
		return -EAGAIN;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->gets);
#endif

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		index = get_cindex(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if (index >= (1ul << bits))
		 *     we have a mismatch in skip bits and failed
		 *   else
		 *     we know the value is cindex
		 *
		 * This check is safe even if bits == KEYLENGTH due to the
		 * fact that we can only allocate a node with 32 bits if a
		 * long is greater than 32 bits.
		 */
		if (index >= (1ul << n->bits))
			break;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
			goto found;

		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
		 */
		if (n->slen > n->pos) {
			pn = n;
			cindex = index;
		}

		n = get_child_rcu(n, index);
		if (unlikely(!n))
			goto backtrace;
	}

	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct key_vector __rcu **cptr = n->tnode;

		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
		 */
		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
			goto backtrace;

		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;

		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
		 */

		while ((n = rcu_dereference(*cptr)) == NULL) {
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			if (!n)
				this_cpu_inc(stats->null_node_hit);
#endif
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				/* If we don't have a parent then there is
				 * nothing for us to do as we do not have any
				 * further nodes to parse.
				 */
				if (IS_TRIE(pn))
					return -EAGAIN;
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				pn = node_parent_rcu(pn);
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->tnode[cindex];
		}
	}

found:
	/* this line carries forward the xor from earlier in the function */
	index = key ^ n->key;

	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;
		int nhsel, err;

		if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
			if (index >= (1ul << fa->fa_slen))
				continue;
		}
		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
			continue;
		if (fi->fib_dead)
			continue;
		if (fa->fa_info->fib_scope < flp->flowi4_scope)
			continue;
		fib_alias_accessed(fa);
		err = fib_props[fa->fa_type].error;
		if (unlikely(err < 0)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
			return err;
		}
		if (fi->fib_flags & RTNH_F_DEAD)
			continue;
		for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
			const struct fib_nh *nh = &fi->fib_nh[nhsel];
			struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);

			if (nh->nh_flags & RTNH_F_DEAD)
				continue;
			if (in_dev &&
			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
			    nh->nh_flags & RTNH_F_LINKDOWN &&
			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
				continue;
			if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
				if (flp->flowi4_oif &&
				    flp->flowi4_oif != nh->nh_oif)
					continue;
			}

			if (!(fib_flags & FIB_LOOKUP_NOREF))
				refcount_inc(&fi->fib_clntref);

			res->prefix = htonl(n->key);
			res->prefixlen = KEYLENGTH - fa->fa_slen;
			res->nh_sel = nhsel;
			res->type = fa->fa_type;
			res->scope = fi->fib_scope;
			res->fi = fi;
			res->table = tb;
			res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
			trace_fib_table_lookup_nh(nh);

			return err;
		}
	}
#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->semantic_match_miss);
#endif
	goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
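
/* Backtracking sketch (added commentary, not in the original source): if
 * pn has bits = 3 and the search descended through cindex 0b110, then
 * cindex &= cindex - 1 visits 0b100 and finally 0b000, i.e. the remaining
 * candidate positions inside this node before the inner while loop climbs
 * to the parent node.
 */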

static void fib_remove_alias(struct trie *t, struct key_vector *tp,
			     struct key_vector *l, struct fib_alias *old)
{
	/* record the location of the previous fib_alias entry */
1513 struct hlist_node **pprev = old->fa_list.pprev;
1514 struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);

	/* remove the fib_alias from the list */
	hlist_del_rcu(&old->fa_list);

	/* if we emptied the list this leaf will be freed and we can sort
	 * out parent suffix lengths as a part of trie_rebalance
	 */
	if (hlist_empty(&l->leaf)) {
		if (tp->slen == l->slen)
			node_pull_suffix(tp, tp->pos);
		put_child_root(tp, l->key, NULL);
		node_free(l);
		trie_rebalance(t, tp);
		return;
	}

	/* only access fa if it is pointing at the last valid hlist_node */
	if (*pprev)
		return;

	/* update the trie with the latest suffix length */
	l->slen = fa->fa_slen;
	node_pull_suffix(tp, fa->fa_slen);
}

/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
		     struct fib_config *cfg, struct netlink_ext_ack *extack)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *fa_to_delete;
	struct key_vector *l, *tp;
	u8 plen = cfg->fc_dst_len;
	u8 slen = KEYLENGTH - plen;
	u8 tos = cfg->fc_tos;
	u32 key;

	key = ntohl(cfg->fc_dst);

	if (!fib_valid_key_len(key, plen, extack))
		return -EINVAL;

	l = fib_find_node(t, &tp, key);
	if (!l)
		return -ESRCH;

	fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	hlist_for_each_entry_from(fa, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if ((fa->fa_slen != slen) ||
		    (fa->tb_id != tb->tb_id) ||
		    (fa->fa_tos != tos))
			break;

		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi, extack) == 0 &&
		    fib_metrics_match(cfg, fi)) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
				 fa_to_delete, extack);
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	if (!plen)
		tb->tb_num_default--;

	fib_remove_alias(t, tp, l, fa_to_delete);

	if (fa_to_delete->fa_state & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net);

	fib_release_info(fa_to_delete->fa_info);
	alias_free_mem_rcu(fa_to_delete);
	return 0;
}

/* Scan for the next leaf starting at the provided key value */
static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
{
	struct key_vector *pn, *n = *tn;
	unsigned long cindex;

	/* this loop is meant to try and find the key in the trie */
	do {
		/* record parent and next child index */
		pn = n;
		cindex = (key > pn->key) ? get_index(key, pn) : 0;

		if (cindex >> pn->bits)
			break;

		/* descend into the next child */
		n = get_child_rcu(pn, cindex++);
		if (!n)
			break;

		/* guarantee forward progress on the keys */
		if (IS_LEAF(n) && (n->key >= key))
			goto found;
	} while (IS_TNODE(n));

	/* this loop will search for the next leaf with a greater key */
	while (!IS_TRIE(pn)) {
		/* if we exhausted the parent node we will need to climb */
		if (cindex >= (1ul << pn->bits)) {
			t_key pkey = pn->key;

			pn = node_parent_rcu(pn);
			cindex = get_index(pkey, pn) + 1;
			continue;
		}

		/* grab the next available node */
		n = get_child_rcu(pn, cindex++);
		if (!n)
			continue;

		/* no need to compare keys since we bumped the index */
		if (IS_LEAF(n))
			goto found;

		/* restart scanning in the new node */
		pn = n;
		cindex = 0;
	}

	*tn = pn;
	return NULL; /* Root of trie */
found:
	/* if we are at the limit for keys just return NULL for the tnode */
	*tn = pn;
	return n;
}

static void fib_trie_free(struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;

	/* walk trie in reverse order and free everything */
	for (;;) {
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			if (IS_TRIE(pn))
				break;

			n = pn;
			pn = node_parent(pn);

			/* drop emptied tnode */
			put_child_root(pn, n->key, NULL);
			node_free(n);

			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			hlist_del_rcu(&fa->fa_list);
			alias_free_mem_rcu(fa);
		}

		put_child_root(pn, n->key, NULL);
		node_free(n);
	}

#ifdef CONFIG_IP_FIB_TRIE_STATS
	free_percpu(t->stats);
#endif
	kfree(tb);
}

struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
{
	struct trie *ot = (struct trie *)oldtb->tb_data;
	struct key_vector *l, *tp = ot->kv;
	struct fib_table *local_tb;
	struct fib_alias *fa;
	struct trie *lt;
	t_key key = 0;

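	/* nothing to unmerge if the table already embeds its own trie */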
	if (oldtb->tb_data == oldtb->__data)
		return oldtb;

	local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
	if (!local_tb)
		return NULL;

	lt = (struct trie *)local_tb->tb_data;

	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
		struct key_vector *local_l = NULL, *local_tp;

		hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
			struct fib_alias *new_fa;

			if (local_tb->tb_id != fa->tb_id)
				continue;

			/* clone fa for new local table */
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (!new_fa)
				goto out;

			memcpy(new_fa, fa, sizeof(*fa));

			/* insert clone into table */
			if (!local_l)
				local_l = fib_find_node(lt, &local_tp, l->key);

			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
					     NULL, l->key)) {
				kmem_cache_free(fn_alias_kmem, new_fa);
				goto out;
			}
		}

		/* stop loop if key wrapped back to 0 */
		key = l->key + 1;
		if (key < l->key)
			break;
	}

	return local_tb;
out:
	fib_trie_free(local_tb);

	return NULL;
}

/* Caller must hold RTNL */
void fib_table_flush_external(struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;

	/* walk trie in reverse order */
	for (;;) {
		unsigned char slen = 0;
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			/* cannot resize the trie vector */
			if (IS_TRIE(pn))
				break;

			/* update the suffix to address pulled leaves */
			if (pn->slen > pn->pos)
				update_suffix(pn);

			/* resize completed node */
			pn = resize(t, pn);
			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			/* if alias was cloned to local then we just
			 * need to remove the local copy from main
			 */
			if (tb->tb_id != fa->tb_id) {
				hlist_del_rcu(&fa->fa_list);
				alias_free_mem_rcu(fa);
				continue;
			}

			/* record local slen */
			slen = fa->fa_slen;
		}

		/* update leaf slen */
		n->slen = slen;

		if (hlist_empty(&n->leaf)) {
			put_child_root(pn, n->key, NULL);
			node_free(n);
		}
	}
}

/* Caller must hold RTNL. */
int fib_table_flush(struct net *net, struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;
	int found = 0;

	/* walk trie in reverse order */
	for (;;) {
		unsigned char slen = 0;
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			/* cannot resize the trie vector */
			if (IS_TRIE(pn))
				break;

			/* update the suffix to address pulled leaves */
			if (pn->slen > pn->pos)
				update_suffix(pn);

			/* resize completed node */
			pn = resize(t, pn);
			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

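		/* only aliases owned by this table whose fib_info is
		 * marked RTNH_F_DEAD are flushed; live entries just
		 * refresh the leaf's suffix length
		 */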
		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			struct fib_info *fi = fa->fa_info;

			if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
			    tb->tb_id != fa->tb_id) {
				slen = fa->fa_slen;
				continue;
			}

			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
						 n->key,
						 KEYLENGTH - fa->fa_slen, fa,
						 NULL);
			hlist_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
			found++;
		}

		/* update leaf slen */
		n->slen = slen;

		if (hlist_empty(&n->leaf)) {
			put_child_root(pn, n->key, NULL);
			node_free(n);
		}
	}

	pr_debug("trie_flush found=%d\n", found);
	return found;
}

static void fib_leaf_notify(struct net *net, struct key_vector *l,
			    struct fib_table *tb, struct notifier_block *nb)
{
	struct fib_alias *fa;

	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (!fi)
			continue;

		/* local and main table can share the same trie,
		 * so don't notify twice for the same entry.
		 */
		if (tb->tb_id != fa->tb_id)
			continue;

		call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key,
					KEYLENGTH - fa->fa_slen, fa);
	}
}

static void fib_table_notify(struct net *net, struct fib_table *tb,
			     struct notifier_block *nb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *l, *tp = t->kv;
	t_key key = 0;

	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
		fib_leaf_notify(net, l, tb, nb);

		key = l->key + 1;
		/* stop in case of wrap around */
		if (key < l->key)
			break;
	}
}

void fib_notify(struct net *net, struct notifier_block *nb)
{
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist)
			fib_table_notify(net, tb, nb);
	}
}

static void __trie_free_rcu(struct rcu_head *head)
{
	struct fib_table *tb = container_of(head, struct fib_table, rcu);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

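	/* only the table that embeds the trie owns the percpu stats; an
	 * alias table merely points tb_data at another table's __data
	 */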
	if (tb->tb_data == tb->__data)
		free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
	kfree(tb);
}

void fib_free_table(struct fib_table *tb)
{
	call_rcu(&tb->rcu, __trie_free_rcu);
}

static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
			     struct sk_buff *skb, struct netlink_callback *cb)
{
	__be32 xkey = htonl(l->key);
	struct fib_alias *fa;
	int i, s_i;

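	/* cb->args[4] remembers how many aliases of this leaf were already
	 * dumped, so an interrupted netlink dump can resume mid-leaf
	 * without re-sending earlier entries
	 */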
	s_i = cb->args[4];
	i = 0;

	/* rcu_read_lock is held by the caller */
	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		int err;

		if (i < s_i) {
			i++;
			continue;
		}

		if (tb->tb_id != fa->tb_id) {
			i++;
			continue;
		}

		err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
				    tb->tb_id, fa->fa_type,
				    xkey, KEYLENGTH - fa->fa_slen,
				    fa->fa_tos, fa->fa_info, NLM_F_MULTI);
		if (err < 0) {
			cb->args[4] = i;
			return err;
		}
		i++;
	}

	cb->args[4] = i;
	return skb->len;
}

/* rcu_read_lock must be held by the caller on the read side */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *l, *tp = t->kv;
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	int count = cb->args[2];
	t_key key = cb->args[3];

	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
		int err;

		err = fn_trie_dump_leaf(l, tb, skb, cb);
		if (err < 0) {
			cb->args[3] = key;
			cb->args[2] = count;
			return err;
		}

		++count;
		key = l->key + 1;

		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));

		/* stop loop if key wrapped back to 0 */
		if (key < l->key)
			break;
	}

	cb->args[3] = key;
	cb->args[2] = count;

	return skb->len;
}

void __init fib_trie_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   LEAF_SIZE,
					   0, SLAB_PANIC, NULL);
}

struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
	struct fib_table *tb;
	struct trie *t;
	size_t sz = sizeof(*tb);

	if (!alias)
		sz += sizeof(struct trie);

	tb = kzalloc(sz, GFP_KERNEL);
	if (!tb)
		return NULL;

	tb->tb_id = id;
	tb->tb_num_default = 0;
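	/* an alias table shares the existing trie (as when main aliases
	 * local) instead of embedding one of its own
	 */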
	tb->tb_data = (alias ? alias->__data : tb->__data);

	if (alias)
		return tb;

	t = (struct trie *) tb->tb_data;
	t->kv[0].pos = KEYLENGTH;
	t->kv[0].slen = KEYLENGTH;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif

	return tb;
}

#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;
	struct fib_table *tb;
	struct key_vector *tnode;
	unsigned int index;
	unsigned int depth;
};

static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
{
	unsigned long cindex = iter->index;
	struct key_vector *pn = iter->tnode;
	t_key pkey;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);

	while (!IS_TRIE(pn)) {
		while (cindex < child_length(pn)) {
			struct key_vector *n = get_child_rcu(pn, cindex++);

			if (!n)
				continue;

			if (IS_LEAF(n)) {
				iter->tnode = pn;
				iter->index = cindex;
			} else {
				/* push down one level */
				iter->tnode = n;
				iter->index = 0;
				++iter->depth;
			}

			return n;
		}

		/* Current node exhausted, pop back up */
		pkey = pn->key;
		pn = node_parent_rcu(pn);
		cindex = get_index(pkey, pn) + 1;
		--iter->depth;
	}

	/* record root node so further searches know we are done */
	iter->tnode = pn;
	iter->index = 0;

	return NULL;
}

static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
					     struct trie *t)
{
	struct key_vector *n, *pn;

	if (!t)
		return NULL;

	pn = t->kv;
	n = rcu_dereference(pn->tnode[0]);
	if (!n)
		return NULL;

	if (IS_TNODE(n)) {
		iter->tnode = n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = pn;
		iter->index = 0;
		iter->depth = 0;
	}

	return n;
}

static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
	struct key_vector *n;
	struct fib_trie_iter iter;

	memset(s, 0, sizeof(*s));

	rcu_read_lock();
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
		if (IS_LEAF(n)) {
			struct fib_alias *fa;

			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;

			hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
				++s->prefixes;
		} else {
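			/* nodesizes[b] counts tnodes with 2^b child slots;
			 * it feeds the per-size histogram printed by
			 * trie_show_stats()
			 */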
			s->tnodes++;
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;
			s->nullpointers += tn_info(n)->empty_children;
		}
	}
	rcu_read_unlock();
}

/*
 * This outputs /proc/net/fib_triestat
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
	unsigned int i, max, pointers, bytes, avdepth;

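	/* average depth is kept as a fixed-point value scaled by 100:
	 * e.g. totdepth 250 over 100 leaves gives avdepth 250, printed
	 * as "2.50" below
	 */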
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;

	seq_printf(seq, "\tAver depth: %u.%02d\n",
		   avdepth / 100, avdepth % 100);
	seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);

	seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
	bytes = LEAF_SIZE * stat->leaves;

	seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
	bytes += sizeof(struct fib_alias) * stat->prefixes;

	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
	bytes += TNODE_SIZE(0) * stat->tnodes;

	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
		max--;

	pointers = 0;
	for (i = 1; i < max; i++)
		if (stat->nodesizes[i] != 0) {
			seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
	seq_printf(seq, "\tPointers: %u\n", pointers);

	bytes += sizeof(struct key_vector *) * pointers;
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}

#ifdef CONFIG_IP_FIB_TRIE_STATS
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats __percpu *stats)
{
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

	seq_printf(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
}
#endif /* CONFIG_IP_FIB_TRIE_STATS */

static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
{
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
}


static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	unsigned int h;

	seq_printf(seq,
		   "Basic info: size of leaf:"
		   " %zd bytes, size of tnode: %zd bytes.\n",
		   LEAF_SIZE, TNODE_SIZE(0));

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;

			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, t->stats);
#endif
		}
	}

	return 0;
}

static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, fib_triestat_seq_show);
}

static const struct file_operations fib_triestat_fops = {
	.open = fib_triestat_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release_net,
};

static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t idx = 0;
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct key_vector *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
	}

	return NULL;
}

static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return fib_trie_get_idx(seq, *pos);
}

static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct key_vector *n;

	++*pos;
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;

	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}

	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
	return NULL;

found:
	iter->tb = tb;
	return n;
}

static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static void seq_indent(struct seq_file *seq, int n)
{
	while (n-- > 0)
		seq_puts(seq, "   ");
2447}
2448
2449static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2450{
2451 switch (s) {
2452 case RT_SCOPE_UNIVERSE: return "universe";
2453 case RT_SCOPE_SITE: return "site";
2454 case RT_SCOPE_LINK: return "link";
2455 case RT_SCOPE_HOST: return "host";
2456 case RT_SCOPE_NOWHERE: return "nowhere";
2457 default:
2458 snprintf(buf, len, "scope=%d", s);
2459 return buf;
2460 }
2461}
2462
2463static const char *const rtn_type_names[__RTN_MAX] = {
2464 [RTN_UNSPEC] = "UNSPEC",
2465 [RTN_UNICAST] = "UNICAST",
2466 [RTN_LOCAL] = "LOCAL",
2467 [RTN_BROADCAST] = "BROADCAST",
2468 [RTN_ANYCAST] = "ANYCAST",
2469 [RTN_MULTICAST] = "MULTICAST",
2470 [RTN_BLACKHOLE] = "BLACKHOLE",
2471 [RTN_UNREACHABLE] = "UNREACHABLE",
2472 [RTN_PROHIBIT] = "PROHIBIT",
2473 [RTN_THROW] = "THROW",
2474 [RTN_NAT] = "NAT",
2475 [RTN_XRESOLVE] = "XRESOLVE",
2476};
2477
2478static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2479{
2480 if (t < __RTN_MAX && rtn_type_names[t])
2481 return rtn_type_names[t];
2482 snprintf(buf, len, "type %u", t);
2483 return buf;
2484}
2485
2486/* Pretty print the trie */
2487static int fib_trie_seq_show(struct seq_file *seq, void *v)
2488{
2489 const struct fib_trie_iter *iter = seq->private;
2490 struct key_vector *n = v;
2491
2492 if (IS_TRIE(node_parent_rcu(n)))
2493 fib_table_print(seq, iter->tb);
2494
2495 if (IS_TNODE(n)) {
2496 __be32 prf = htonl(n->key);
2497
2498 seq_indent(seq, iter->depth-1);
2499 seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
2500 &prf, KEYLENGTH - n->pos - n->bits, n->bits,
2501 tn_info(n)->full_children,
2502 tn_info(n)->empty_children);
2503 } else {
2504 __be32 val = htonl(n->key);
2505 struct fib_alias *fa;
2506
2507 seq_indent(seq, iter->depth);
2508 seq_printf(seq, " |-- %pI4\n", &val);
2509
2510 hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
2511 char buf1[32], buf2[32];
2512
2513 seq_indent(seq, iter->depth + 1);
2514 seq_printf(seq, " /%zu %s %s",
2515 KEYLENGTH - fa->fa_slen,
2516 rtn_scope(buf1, sizeof(buf1),
2517 fa->fa_info->fib_scope),
2518 rtn_type(buf2, sizeof(buf2),
2519 fa->fa_type));
2520 if (fa->fa_tos)
2521 seq_printf(seq, " tos=%d", fa->fa_tos);
2522 seq_putc(seq, '\n');
2523 }
2524 }
2525
2526 return 0;
2527}
2528
2529static const struct seq_operations fib_trie_seq_ops = {
2530 .start = fib_trie_seq_start,
2531 .next = fib_trie_seq_next,
2532 .stop = fib_trie_seq_stop,
2533 .show = fib_trie_seq_show,
2534};
2535
2536static int fib_trie_seq_open(struct inode *inode, struct file *file)
2537{
2538 return seq_open_net(inode, file, &fib_trie_seq_ops,
2539 sizeof(struct fib_trie_iter));
2540}
2541
2542static const struct file_operations fib_trie_fops = {
2543 .open = fib_trie_seq_open,
2544 .read = seq_read,
2545 .llseek = seq_lseek,
2546 .release = seq_release_net,
2547};
2548
2549struct fib_route_iter {
2550 struct seq_net_private p;
2551 struct fib_table *main_tb;
2552 struct key_vector *tnode;
2553 loff_t pos;
2554 t_key key;
2555};
2556
2557static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
2558 loff_t pos)
2559{
2560 struct key_vector *l, **tp = &iter->tnode;
2561 t_key key;
2562
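	/* iter->pos/iter->key cache the last leaf handed out, so a forward
	 * seek (the common seq_file pattern) resumes from the cached key
	 * instead of rewalking the trie from zero
	 */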
	/* use cached location of previously found key */
	if (iter->pos > 0 && pos >= iter->pos) {
		key = iter->key;
	} else {
		iter->pos = 1;
		key = 0;
	}

	pos -= iter->pos;

	while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
		key = l->key + 1;
		iter->pos++;
		l = NULL;

		/* handle unlikely case of a key wrap */
		if (!key)
			break;
	}

	if (l)
		iter->key = l->key;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;
	struct trie *t;

	rcu_read_lock();

	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
	if (!tb)
		return NULL;

	iter->main_tb = tb;
	t = (struct trie *)tb->tb_data;
	iter->tnode = t->kv;

	if (*pos != 0)
		return fib_route_get_idx(iter, *pos);

	iter->pos = 0;
	iter->key = KEY_MAX;

	return SEQ_START_TOKEN;
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct key_vector *l = NULL;
	t_key key = iter->key + 1;

	++*pos;

	/* only allow key of 0 for start of sequence */
	if ((v == SEQ_START_TOKEN) || key)
		l = leaf_walk_rcu(&iter->tnode, key);

	if (l) {
		iter->key = l->key;
		iter->pos++;
	} else {
		iter->pos = 0;
	}

	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
{
	unsigned int flags = 0;

	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
	if (mask == htonl(0xFFFFFFFF))
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
}

/*
 * This outputs /proc/net/route.
 * The format of the file is not supposed to be changed
 * and needs to be the same as the fib_hash output to avoid
 * breaking legacy utilities.
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb = iter->main_tb;
	struct fib_alias *fa;
	struct key_vector *l = v;
	__be32 prefix;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	prefix = htonl(l->key);

	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		const struct fib_info *fi = fa->fa_info;
		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);

		if ((fa->fa_type == RTN_BROADCAST) ||
		    (fa->fa_type == RTN_MULTICAST))
			continue;

		if (fa->tb_id != tb->tb_id)
			continue;

		seq_setwidth(seq, 127);

		if (fi)
			seq_printf(seq,
				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
				   "%d\t%08X\t%d\t%u\t%u",
				   fi->fib_dev ? fi->fib_dev->name : "*",
				   prefix,
				   fi->fib_nh->nh_gw, flags, 0, 0,
				   fi->fib_priority,
				   mask,
				   (fi->fib_advmss ?
				    fi->fib_advmss + 40 : 0),
				   fi->fib_window,
				   fi->fib_rtt >> 3);
		else
			seq_printf(seq,
				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
				   "%d\t%08X\t%d\t%u\t%u",
				   prefix, 0, flags, 0, 0, 0,
				   mask, 0, 0, 0);

		seq_pad(seq, '\n');
	}

	return 0;
}

static const struct seq_operations fib_route_seq_ops = {
	.start = fib_route_seq_start,
	.next = fib_route_seq_next,
	.stop = fib_route_seq_stop,
	.show = fib_route_seq_show,
};

static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			    sizeof(struct fib_route_iter));
}

static const struct file_operations fib_route_fops = {
	.open = fib_route_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};

int __net_init fib_proc_init(struct net *net)
{
	if (!proc_create("fib_trie", 0444, net->proc_net, &fib_trie_fops))
		goto out1;

	if (!proc_create("fib_triestat", 0444, net->proc_net,
			 &fib_triestat_fops))
		goto out2;

	if (!proc_create("route", 0444, net->proc_net, &fib_route_fops))
		goto out3;

	return 0;

out3:
	remove_proc_entry("fib_triestat", net->proc_net);
out2:
	remove_proc_entry("fib_trie", net->proc_net);
out1:
	return -ENOMEM;
}

void __net_exit fib_proc_exit(struct net *net)
{
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
}

#endif /* CONFIG_PROC_FS */