/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

#define VERSION "0.409"
52

53
#include <linux/cache.h>
54
#include <linux/uaccess.h>
55
#include <linux/bitops.h>
56 57 58 59 60 61 62 63 64
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
65
#include <linux/inetdevice.h>
66 67 68
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
69
#include <linux/rcupdate.h>
70 71 72 73
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
74
#include <linux/slab.h>
75
#include <linux/export.h>
76
#include <linux/vmalloc.h>
77
#include <linux/notifier.h>
78
#include <net/net_namespace.h>
79 80 81 82 83 84
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
85
#include <net/fib_notifier.h>
#include <trace/events/fib.h>
87 88
#include "fib_lookup.h"

89 90
/* Deliver a FIB entry event for alias @fa to a single notifier block @nb.
 * The route is identified by prefix @dst/@dst_len; the remaining route
 * attributes are copied out of the alias itself.
 */
static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
				   enum fib_event_type event_type, u32 dst,
				   int dst_len, struct fib_alias *fa)
{
	struct fib_entry_notifier_info info = {
		.dst = dst,
		.dst_len = dst_len,
		.fi = fa->fa_info,
		.tos = fa->fa_tos,
		.type = fa->fa_type,
		.tb_id = fa->tb_id,
	};
	return call_fib4_notifier(nb, net, event_type, &info.info);
}

104 105
/* Deliver a FIB entry event for alias @fa to all registered IPv4
 * notifiers.  @extack is forwarded so notifiers can report errors back
 * to the caller.
 */
static int call_fib_entry_notifiers(struct net *net,
				    enum fib_event_type event_type, u32 dst,
				    int dst_len, struct fib_alias *fa,
				    struct netlink_ext_ack *extack)
{
	struct fib_entry_notifier_info info = {
		.info.extack = extack,
		.dst = dst,
		.dst_len = dst_len,
		.fi = fa->fa_info,
		.tos = fa->fa_tos,
		.type = fa->fa_type,
		.tb_id = fa->tb_id,
	};
	return call_fib4_notifiers(net, event_type, &info.info);
}

121
/* deepest level accounted for when gathering trie statistics */
#define MAX_STAT_DEPTH 32

/* key width in bits, and the all-ones key value */
#define KEYLENGTH	(8*sizeof(t_key))
#define KEY_MAX		((t_key)~0)

typedef unsigned int t_key;

/* The pseudo-root is the only node with pos >= KEYLENGTH.  A leaf has
 * bits == 0, an internal node (tnode) has bits > 0.
 */
#define IS_TRIE(n)	((n)->pos >= KEYLENGTH)
#define IS_TNODE(n)	((n)->bits)
#define IS_LEAF(n)	(!(n)->bits)
131

132
/* A trie node as seen by lookups: either a leaf holding a list of fib
 * aliases, or an internal node (tnode) with an array of child pointers
 * indexed by 'bits' bits of the key starting at bit 'pos'.
 */
struct key_vector {
	t_key key;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char slen;		/* suffix length, maintained by update_suffix() */
	union {
		/* This list pointer is valid if (pos | bits) == 0 (LEAF) */
		struct hlist_head leaf;
		/* This array is valid if (pos | bits) > 0 (TNODE) */
		struct key_vector __rcu *tnode[0];
	};
};

145
/* Allocation wrapper around key_vector: carries the RCU head, the
 * parent back-pointer, and the child occupancy counters that the
 * lookup fast path does not need to touch.
 */
struct tnode {
	struct rcu_head rcu;
	t_key empty_children;		/* KEYLENGTH bits needed */
	t_key full_children;		/* KEYLENGTH bits needed */
	struct key_vector __rcu *parent;
	struct key_vector kv[1];
#define tn_bits kv[0].bits
};

/* size of a tnode with room for n child pointers; a leaf is sized as
 * a one-slot node
 */
#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
#define LEAF_SIZE	TNODE_SIZE(1)

157 158 159 160 161 162 163
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
164
	unsigned int resize_node_skipped;
165 166 167 168 169 170 171 172 173
};
#endif

/* aggregate statistics describing the trie's current shape */
struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};
177 178

/* the trie proper: kv[0] is the pseudo-root key vector */
struct trie {
	struct key_vector kv[1];
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats;
#endif
};

185
static struct key_vector *resize(struct trie *t, struct key_vector *tn);

/* bytes queued for RCU freeing since the last forced grace period,
 * accounted in tnode_free()
 */
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;

/* slab caches for fib aliases and leaf nodes */
static struct kmem_cache *fn_alias_kmem __ro_after_init;
static struct kmem_cache *trie_leaf_kmem __ro_after_init;
197

198 199 200 201 202
/* map a key_vector back to its enclosing tnode allocation */
static inline struct tnode *tn_info(struct key_vector *kv)
{
	return container_of(kv, struct tnode, kv[0]);
}

203
/* accessors for the RCU-protected parent/child pointers */

/* caller must hold RTNL */
#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])

/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
210

211
/* wrapper for rcu_assign_pointer; tolerates n == NULL */
static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
{
	if (n)
		rcu_assign_pointer(tn_info(n)->parent, tp);
}

/* set the parent pointer of a node not yet visible to RCU readers */
#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
219 220 221

/* This provides us with the number of children in this node, in the case of a
 * leaf this will return 0 meaning none of the children are accessible.
222
 */
223
static inline unsigned long child_length(const struct key_vector *tn)
224
{
225
	return (1ul << tn->bits) & ~(1ul);
226
}
227

228 229
#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)

230 231 232 233
/* index of @key within @kv's child array; 0 for the pseudo-root */
static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
	unsigned long index = key ^ kv->key;

	/* avoid an undefined full-width shift when pos == KEYLENGTH and
	 * unsigned long is no wider than the key
	 */
	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
		return 0;

	return index >> kv->pos;
}

240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285
/* To understand this stuff, an understanding of keys and all their bits is
 * necessary. Every node in the trie has a key associated with it, but not
 * all of the bits in that key are significant.
 *
 * Consider a node 'n' and its parent 'tp'.
 *
 * If n is a leaf, every bit in its key is significant. Its presence is
 * necessitated by path compression, since during a tree traversal (when
 * searching for a leaf - unless we are doing an insertion) we will completely
 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
 * a potentially successful search, that we have indeed been walking the
 * correct key path.
 *
 * Note that we can never "miss" the correct key in the tree if present by
 * following the wrong path. Path compression ensures that segments of the key
 * that are the same for all keys with a given prefix are skipped, but the
 * skipped part *is* identical for each node in the subtrie below the skipped
 * bit! trie_insert() in this implementation takes care of that.
 *
 * if n is an internal node - a 'tnode' here, the various parts of its key
 * have many different meanings.
 *
 * Example:
 * _________________________________________________________________
 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 * -----------------------------------------------------------------
 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 *
 * _________________________________________________________________
 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 * -----------------------------------------------------------------
 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 *
 * tp->pos = 22
 * tp->bits = 3
 * n->pos = 13
 * n->bits = 4
 *
 * First, let's just ignore the bits that come before the parent tp, that is
 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 * point we do not use them for anything.
 *
 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 * index into the parent's child array. That is, they will be used to find
 * 'n' among tp's children.
 *
286
 * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
287 288 289 290 291 292 293 294
 * for the node n.
 *
 * All the bits we have seen so far are significant to the node n. The rest
 * of the bits are really not needed or indeed known in n->key.
 *
 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 * n's child array, and will of course be different for each child.
 *
295
 * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
296 297
 * at this point.
 */
298

299 300
/* resize thresholds, in percent (see the discussion above
 * should_inflate()); the root node is deliberately kept larger
 */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;
303 304

/* RCU callback returning a fib_alias to its slab cache */
static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}

310
/* free @fa once all current RCU readers are done with it */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
314

315
/* largest 'bits' for which a tnode still fits in one page, and the
 * absolute limit before the size computation would overflow SIZE_MAX
 */
#define TNODE_KMALLOC_MAX \
	ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
#define TNODE_VMALLOC_MAX \
	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
319

320
static void __node_free_rcu(struct rcu_head *head)
321
{
322
	struct tnode *n = container_of(head, struct tnode, rcu);
323

324
	if (!n->tn_bits)
325 326
		kmem_cache_free(trie_leaf_kmem, n);
	else
327
		kvfree(n);
328 329
}

330
#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
331

332
static struct tnode *tnode_alloc(int bits)
333
{
334 335 336 337 338 339 340 341 342
	size_t size;

	/* verify bits is within bounds */
	if (bits > TNODE_VMALLOC_MAX)
		return NULL;

	/* determine size and verify it is non-zero and didn't overflow */
	size = TNODE_SIZE(1ul << bits);

343
	if (size <= PAGE_SIZE)
344
		return kzalloc(size, GFP_KERNEL);
345
	else
346
		return vzalloc(size);
347
}
348

349
static inline void empty_child_inc(struct key_vector *n)
350
{
351
	++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
352 353
}

354
static inline void empty_child_dec(struct key_vector *n)
355
{
356
	tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
357 358
}

359
/* allocate a new leaf for @key and seed it with alias @fa;
 * returns NULL on allocation failure
 */
static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
	struct key_vector *l;
	struct tnode *kv;

	kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
	if (!kv)
		return NULL;

	/* initialize key vector */
	l = kv->kv;
	l->key = key;
	l->pos = 0;
	l->bits = 0;
	l->slen = fa->fa_slen;

	/* link leaf to fib alias */
	INIT_HLIST_HEAD(&l->leaf);
	hlist_add_head(&fa->fa_list, &l->leaf);

	return l;
}

382
/* allocate an internal node indexing @bits bits of the key starting at
 * bit @pos; returns NULL on allocation failure
 */
static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
	unsigned int shift = pos + bits;
	struct key_vector *tn;
	struct tnode *tnode;

	/* verify bits and pos their msb bits clear and values are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));

	tnode = tnode_alloc(bits);
	if (!tnode)
		return NULL;

	pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
		 sizeof(struct key_vector *) << bits);

	/* bits == KEYLENGTH would wrap the empty_children counter, so
	 * the all-empty state is encoded via full_children instead
	 * (see empty_child_inc()/should_collapse())
	 */
	if (bits == KEYLENGTH)
		tnode->full_children = 1;
	else
		tnode->empty_children = 1ul << bits;

	tn = tnode->kv;
	/* keep only the prefix bits above this node's index range */
	tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
	tn->pos = pos;
	tn->bits = bits;
	tn->slen = pos;

	return tn;
}

412
/* Check whether a tnode 'n' is "full", i.e. it is an internal node
413 414
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */
415
static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
416
{
417
	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
418 419
}

420 421 422
/* Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 * Caller must hold RTNL; @i must be a valid index in @tn.
 */
static void put_child(struct key_vector *tn, unsigned long i,
		      struct key_vector *n)
{
	struct key_vector *chi = get_child(tn, i);
	int isfull, wasfull;

	BUG_ON(i >= child_length(tn));

	/* update emptyChildren, overflow into fullChildren */
	if (!n && chi)
		empty_child_inc(tn);
	if (n && !chi)
		empty_child_dec(tn);

	/* update fullChildren */
	wasfull = tnode_full(tn, chi);
	isfull = tnode_full(tn, n);

	if (wasfull && !isfull)
		tn_info(tn)->full_children--;
	else if (!wasfull && isfull)
		tn_info(tn)->full_children++;

	/* a longer-suffix child widens this node's suffix length */
	if (n && (tn->slen < n->slen))
		tn->slen = n->slen;

	rcu_assign_pointer(tn->tnode[i], n);
}

452
/* walk @tn's child array fixing up parent pointers after a node swap */
static void update_children(struct key_vector *tn)
{
	unsigned long i;

	/* update all of the child parent pointers */
	for (i = child_length(tn); i;) {
		struct key_vector *inode = get_child(tn, --i);

		if (!inode)
			continue;

		/* Either update the children of a tnode that
		 * already belongs to us or update the child
		 * to point to ourselves.
		 */
		if (node_parent(inode) == tn)
			update_children(inode);
		else
			node_set_parent(inode, tn);
	}
}

474 475
/* Install @n as the child slot for @key under @tp, handling the
 * pseudo-root (which has a single slot) as a special case.
 */
static inline void put_child_root(struct key_vector *tp, t_key key,
				  struct key_vector *n)
{
	if (!IS_TRIE(tp)) {
		put_child(tp, get_index(key, tp), n);
		return;
	}

	rcu_assign_pointer(tp->tnode[0], n);
}

483
/* start a private free list threaded through the node's rcu.next */
static inline void tnode_free_init(struct key_vector *tn)
{
	tn_info(tn)->rcu.next = NULL;
}

488 489
/* push @n onto the free list headed at @tn (see tnode_free_init()) */
static inline void tnode_free_append(struct key_vector *tn,
				     struct key_vector *n)
{
	tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
	tn_info(tn)->rcu.next = &tn_info(n)->rcu;
}
494

495
/* Hand every node on @tn's free list to RCU.  Once enough memory has
 * been queued (sync_pages worth), force a grace period so large
 * rebalances do not pile up too many pending frees.
 */
static void tnode_free(struct key_vector *tn)
{
	struct callback_head *head = &tn_info(tn)->rcu;

	while (head) {
		/* advance before node_free() hands the entry to RCU */
		head = head->next;
		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
		node_free(tn);

		tn = container_of(head, struct tnode, rcu)->kv;
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
}

513 514 515
/* Swap @oldtnode for the fully-assembled @tn in the trie, free the old
 * node (and anything queued on its free list), then resize any full
 * child tnodes.  Returns the parent to resume rebalancing from.
 */
static struct key_vector *replace(struct trie *t,
				  struct key_vector *oldtnode,
				  struct key_vector *tn)
{
	struct key_vector *tp = node_parent(oldtnode);
	unsigned long i;

	/* setup the parent pointer out of and back into this node */
	NODE_INIT_PARENT(tn, tp);
	put_child_root(tp, tn->key, tn);

	/* update all of the child parent pointers */
	update_children(tn);

	/* all pointers should be clean so we are done */
	tnode_free(oldtnode);

	/* resize children now that oldtnode is freed */
	for (i = child_length(tn); i;) {
		struct key_vector *inode = get_child(tn, --i);

		/* resize child node */
		if (tnode_full(tn, inode))
			tn = resize(t, inode);
	}

	return tp;
}

542 543
/* Double @oldtnode's child array by pulling one more bit of each child
 * key into the index.  Returns the parent on success, NULL on OOM (all
 * partially-built nodes are cleaned up via the free list).
 */
static struct key_vector *inflate(struct trie *t,
				  struct key_vector *oldtnode)
{
	struct key_vector *tn;
	unsigned long i;
	t_key m;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
	if (!tn)
		goto notnode;

	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

	/* Assemble all of the pointers in our cluster, in this case that
	 * represents all of the pointers out of our allocated nodes that
	 * point to existing tnodes and the links between our allocated
	 * nodes.
	 */
	for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
		struct key_vector *inode = get_child(oldtnode, --i);
		struct key_vector *node0, *node1;
		unsigned long j, k;

		/* An empty child */
		if (!inode)
			continue;

		/* A leaf or an internal node with skipped bits */
		if (!tnode_full(oldtnode, inode)) {
			put_child(tn, get_index(inode->key, tn), inode);
			continue;
		}

		/* drop the node in the old tnode free list */
		tnode_free_append(oldtnode, inode);

		/* An internal node with two children */
		if (inode->bits == 1) {
			put_child(tn, 2 * i + 1, get_child(inode, 1));
			put_child(tn, 2 * i, get_child(inode, 0));
			continue;
		}

		/* We will replace this node 'inode' with two new
		 * ones, 'node0' and 'node1', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * node0's key and "1" in node1's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (tn->pos) - is the one that will differ between
		 * node0 and node1. So... we synthesize that bit in the
		 * two new keys.
		 */
		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
		if (!node1)
			goto nomem;
		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);

		/* node1 goes on the free list before the node0 NULL
		 * check so a failure path reclaims it too
		 */
		tnode_free_append(tn, node1);
		if (!node0)
			goto nomem;
		tnode_free_append(tn, node0);

		/* populate child pointers in new nodes */
		for (k = child_length(inode), j = k / 2; j;) {
			put_child(node1, --j, get_child(inode, --k));
			put_child(node0, j, get_child(inode, j));
			put_child(node1, --j, get_child(inode, --k));
			put_child(node0, j, get_child(inode, j));
		}

		/* link new nodes to parent */
		NODE_INIT_PARENT(node1, tn);
		NODE_INIT_PARENT(node0, tn);

		/* link parent to nodes */
		put_child(tn, 2 * i + 1, node1);
		put_child(tn, 2 * i, node0);
	}

	/* setup the parent pointers into and out of this node */
	return replace(t, oldtnode, tn);
nomem:
	/* all pointers should be clean so we are done */
	tnode_free(tn);
notnode:
	return NULL;
}

638 639
/* Halve @oldtnode's child array, joining each pair of slots with a
 * one-bit tnode where both children are nonempty.  Returns the parent
 * on success, NULL on OOM.
 */
static struct key_vector *halve(struct trie *t,
				struct key_vector *oldtnode)
{
	struct key_vector *tn;
	unsigned long i;

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
	if (!tn)
		goto notnode;

	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

	/* Assemble all of the pointers in our cluster, in this case that
	 * represents all of the pointers out of our allocated nodes that
	 * point to existing tnodes and the links between our allocated
	 * nodes.
	 */
	for (i = child_length(oldtnode); i;) {
		struct key_vector *node1 = get_child(oldtnode, --i);
		struct key_vector *node0 = get_child(oldtnode, --i);
		struct key_vector *inode;

		/* At least one of the children is empty */
		if (!node1 || !node0) {
			put_child(tn, i / 2, node1 ? : node0);
			continue;
		}

		/* Two nonempty children */
		inode = tnode_new(node0->key, oldtnode->pos, 1);
		if (!inode)
			goto nomem;
		tnode_free_append(tn, inode);

		/* initialize pointers out of node */
		put_child(inode, 1, node1);
		put_child(inode, 0, node0);
		NODE_INIT_PARENT(inode, tn);

		/* link parent to node */
		put_child(tn, i / 2, inode);
	}

	/* setup the parent pointers into and out of this node */
	return replace(t, oldtnode, tn);
nomem:
	/* all pointers should be clean so we are done */
	tnode_free(tn);
notnode:
	return NULL;
}

693 694
/* Replace a tnode with at most one remaining child by that child
 * (or NULL), compressing one level.  Returns the parent.
 */
static struct key_vector *collapse(struct trie *t,
				   struct key_vector *oldtnode)
{
	struct key_vector *n, *tp;
	unsigned long i;

	/* scan the tnode looking for that one child that might still exist */
	for (n = NULL, i = child_length(oldtnode); !n && i;)
		n = get_child(oldtnode, --i);

	/* compress one level */
	tp = node_parent(oldtnode);
	put_child_root(tp, oldtnode->key, n);
	node_set_parent(n, tp);

	/* drop dead node */
	node_free(oldtnode);

	return tp;
}

714
/* recompute tn->slen from the children and return the new value */
static unsigned char update_suffix(struct key_vector *tn)
{
	unsigned char slen = tn->pos;
	unsigned long stride, i;
	unsigned char slen_max;

	/* only vector 0 can have a suffix length greater than or equal to
	 * tn->pos + tn->bits, the second highest node will have a suffix
	 * length at most of tn->pos + tn->bits - 1
	 */
	slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);

	/* search though the list of children looking for nodes that might
	 * have a suffix greater than the one we currently have.  This is
	 * why we start with a stride of 2 since a stride of 1 would
	 * represent the nodes with suffix length equal to tn->pos
	 */
	for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
		struct key_vector *n = get_child(tn, i);

		if (!n || (n->slen <= slen))
			continue;

		/* update stride and slen based on new value */
		stride <<= (n->slen - slen);
		slen = n->slen;
		i &= ~(stride - 1);

		/* stop searching if we have hit the maximum possible value */
		if (slen >= slen_max)
			break;
	}

	tn->slen = slen;

	return slen;
}

752 753 754 755 756 757 758 759
/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 * the Helsinki University of Technology and Matti Tikkanen of Nokia
 * Telecommunications, page 6:
 * "A node is doubled if the ratio of non-empty children to all
 * children in the *doubled* node is at least 'high'."
 *
 * 'high' in this instance is the variable 'inflate_threshold'. It
 * is expressed as a percentage, so we multiply it with
760
 * child_length() and instead of multiplying by 2 (since the
761 762 763 764
 * child array will be doubled by inflate()) and multiplying
 * the left-hand side by 100 (to handle the percentage thing) we
 * multiply the left-hand side by 50.
 *
765
 * The left-hand side may look a bit weird: child_length(tn)
766 767 768 769 770 771 772 773 774
 * - tn->empty_children is of course the number of non-null children
 * in the current node. tn->full_children is the number of "full"
 * children, that is non-null tnodes with a skip value of 0.
 * All of those will be doubled in the resulting inflated tnode, so
 * we just count them one extra time here.
 *
 * A clearer way to write this would be:
 *
 * to_be_doubled = tn->full_children;
775
 * not_to_be_doubled = child_length(tn) - tn->empty_children -
776 777
 *     tn->full_children;
 *
778
 * new_child_length = child_length(tn) * 2;
779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794
 *
 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 *      new_child_length;
 * if (new_fill_factor >= inflate_threshold)
 *
 * ...and so on, tho it would mess up the while () loop.
 *
 * anyway,
 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 *      inflate_threshold
 *
 * avoid a division:
 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 *      inflate_threshold * new_child_length
 *
 * expand not_to_be_doubled and to_be_doubled, and shorten:
795
 * 100 * (child_length(tn) - tn->empty_children +
796 797 798
 *    tn->full_children) >= inflate_threshold * new_child_length
 *
 * expand new_child_length:
799
 * 100 * (child_length(tn) - tn->empty_children +
800
 *    tn->full_children) >=
801
 *      inflate_threshold * child_length(tn) * 2
802 803
 *
 * shorten again:
804
 * 50 * (tn->full_children + child_length(tn) -
805
 *    tn->empty_children) >= inflate_threshold *
806
 *    child_length(tn)
807 808
 *
 */
809
/* decide whether doubling @tn would keep occupancy above the inflate
 * threshold (see the derivation in the comment block above)
 */
static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
{
	unsigned long used = child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
	threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
	used -= tn_info(tn)->empty_children;
	used += tn_info(tn)->full_children;

	/* if bits == KEYLENGTH then pos = 0, and will fail below */

	return (used > 1) && tn->pos && ((50 * used) >= threshold);
}

824
/* decide whether @tn has dropped below the halve threshold */
static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
{
	unsigned long used = child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
	threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
	used -= tn_info(tn)->empty_children;

	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */

	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
}

838
static inline bool should_collapse(struct key_vector *tn)
839
{
840
	unsigned long used = child_length(tn);
841

842
	used -= tn_info(tn)->empty_children;
843 844

	/* account for bits == KEYLENGTH case */
845
	if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
846 847 848 849
		used -= KEY_MAX;

	/* One child or none, time to drop us from the trie */
	return used < 2;
850 851
}

852
#define MAX_WORK 10
/* Rebalance the subtrie rooted at @tn: inflate while beneficial,
 * otherwise halve, and finally collapse when at most one child
 * remains.  Returns the parent node to continue rebalancing at.
 */
static struct key_vector *resize(struct trie *t, struct key_vector *tn)
{
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	struct key_vector *tp = node_parent(tn);
	unsigned long cindex = get_index(tn->key, tp);
	int max_work = MAX_WORK;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* track the tnode via the pointer from the parent instead of
	 * doing it ourselves.  This way we can let RCU fully do its
	 * thing without us interfering
	 */
	BUG_ON(tn != get_child(tp, cindex));

	/* Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */
	while (should_inflate(tp, tn) && max_work) {
		tp = inflate(t, tn);
		if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->resize_node_skipped);
#endif
			break;
		}

		max_work--;
		tn = get_child(tp, cindex);
	}

	/* update parent in case inflate failed */
	tp = node_parent(tn);

	/* Return if at least one inflate is run */
	if (max_work != MAX_WORK)
		return tp;

	/* Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
	while (should_halve(tp, tn) && max_work) {
		tp = halve(t, tn);
		if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->resize_node_skipped);
#endif
			break;
		}

		max_work--;
		tn = get_child(tp, cindex);
	}

	/* Only one child remains */
	if (should_collapse(tn))
		return collapse(t, tn);

	/* update parent in case halve failed */
	return node_parent(tn);
}

918
/* Propagate a suffix-length decrease up through the parents, stopping
 * as soon as a node's recomputed slen is unchanged.
 */
static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
	unsigned char node_slen = tn->slen;

	while ((node_slen > tn->pos) && (node_slen > slen)) {
		slen = update_suffix(tn);
		if (node_slen == slen)
			break;

		tn = node_parent(tn);
		node_slen = tn->slen;
	}
}

932
static void node_push_suffix(struct key_vector *tn, unsigned char slen)
933
{
934 935
	while (tn->slen < slen) {
		tn->slen = slen;
936 937 938 939
		tn = node_parent(tn);
	}
}

940
/* Find the leaf exactly matching @key; rcu_read_lock must be held by
 * the caller on the read side.  On return *tp holds the parent of the
 * result (or the deepest node visited on a miss).
 */
static struct key_vector *fib_find_node(struct trie *t,
					struct key_vector **tp, u32 key)
{
	struct key_vector *pn, *n = t->kv;
	unsigned long index = 0;

	do {
		pn = n;
		n = get_child_rcu(n, index);

		if (!n)
			break;

		index = get_cindex(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if (index >= (1ul << bits))
		 *     we have a mismatch in skip bits and failed
		 *   else
		 *     we know the value is cindex
		 *
		 * This check is safe even if bits == KEYLENGTH due to the
		 * fact that we can only allocate a node with 32 bits if a
		 * long is greater than 32 bits.
		 */
		if (index >= (1ul << n->bits)) {
			n = NULL;
			break;
		}

		/* keep searching until we find a perfect match leaf or NULL */
	} while (IS_TNODE(n));

	*tp = pn;

	return n;
}

983 984 985
/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 * The comparisons below rely on the list ordering maintained by
 * fib_insert_alias(): ascending slen, then descending tb_id, then
 * descending tos.
 */
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
					u8 tos, u32 prio, u32 tb_id)
{
	struct fib_alias *fa;

	if (!fah)
		return NULL;

	hlist_for_each_entry(fa, fah, fa_list) {
		if (fa->fa_slen < slen)
			continue;
		if (fa->fa_slen != slen)
			break;
		if (fa->tb_id > tb_id)
			continue;
		if (fa->tb_id != tb_id)
			break;
		if (fa->fa_tos > tos)
			continue;
		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
			return fa;
	}

	return NULL;
}

1012
/* resize upward from @tn until the pseudo-root is reached */
static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
	for (; !IS_TRIE(tn); tn = resize(t, tn))
		;
}

1018
/* Insert a new leaf carrying alias @new at @key under parent @tp,
 * creating an intermediate one-bit tnode when the slot is already
 * occupied.  Returns 0 on success or -ENOMEM.
 */
static int fib_insert_node(struct trie *t, struct key_vector *tp,
			   struct fib_alias *new, t_key key)
{
	struct key_vector *n, *l;

	l = leaf_new(key, new);
	if (!l)
		goto noleaf;

	/* retrieve child from parent node */
	n = get_child(tp, get_index(key, tp));

	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
	 *
	 *  Add a new tnode here
	 *  first tnode need some special handling
	 *  leaves us in position for handling as case 3
	 */
	if (n) {
		struct key_vector *tn;

		/* branch at the highest bit where the keys differ */
		tn = tnode_new(key, __fls(key ^ n->key), 1);
		if (!tn)
			goto notnode;

		/* initialize routes out of node */
		NODE_INIT_PARENT(tn, tp);
		put_child(tn, get_index(key, tn) ^ 1, n);

		/* start adding routes into the node */
		put_child_root(tp, key, tn);
		node_set_parent(n, tn);

		/* parent now has a NULL spot where the leaf can go */
		tp = tn;
	}

	/* Case 3: n is NULL, and will just insert a new leaf */
	node_push_suffix(tp, new->fa_slen);
	NODE_INIT_PARENT(l, tp);
	put_child_root(tp, key, l);
	trie_rebalance(t, tp);

	return 0;
notnode:
	node_free(l);
noleaf:
	return -ENOMEM;
}

1068 1069 1070
/* fib notifier for ADD is sent before calling fib_insert_alias with
 * the expectation that the only possible failure ENOMEM
 *
 * Insert @new into leaf @l (created on demand): before @fa when given,
 * otherwise at its sorted position (ascending slen, descending tb_id).
 */
static int fib_insert_alias(struct trie *t, struct key_vector *tp,
			    struct key_vector *l, struct fib_alias *new,
			    struct fib_alias *fa, t_key key)
{
	if (!l)
		return fib_insert_node(t, tp, new, key);

	if (fa) {
		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
	} else {
		struct fib_alias *last;

		/* find the last entry that should sort before @new */
		hlist_for_each_entry(last, &l->leaf, fa_list) {
			if (new->fa_slen < last->fa_slen)
				break;
			if ((new->fa_slen == last->fa_slen) &&
			    (new->tb_id > last->tb_id))
				break;
			fa = last;
		}

		if (fa)
			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
		else
			hlist_add_head_rcu(&new->fa_list, &l->leaf);
	}

	/* if we added to the tail node then we need to update slen */
	if (l->slen < new->fa_slen) {
		l->slen = new->fa_slen;
		node_push_suffix(tp, new->fa_slen);
	}

	return 0;
}

1107
/* Validate that @plen is a legal prefix length and that @key has no
 * bits set below the prefix; failures are reported via @extack.
 */
static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
{
	if (plen > KEYLENGTH) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		return false;
	}

	if ((plen < KEYLENGTH) && (key << plen)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid prefix for given prefix length");
		return false;
	}

	return true;
}

1123
/* Caller must hold RTNL. */
1124
int fib_table_insert(struct net *net, struct fib_table *tb,
1125
		     struct fib_config *cfg, struct netlink_ext_ack *extack)
1126
{
1127
	enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
1128
	struct trie *t = (struct trie *)tb->tb_data;