/*
 *	Linux IPv6 multicast routing support for BSD pim6sd
 *	Based on net/ipv4/ipmr.c.
 *
 *	(c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
 *		LSIIT Laboratory, Strasbourg, France
 *	(c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
 *		6WIND, Paris, France
 *	Copyright (C)2007,2008 USAGI/WIDE Project
 *		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/rhashtable.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>

#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <linux/mroute6.h>
#include <linux/pim.h>
#include <net/addrconf.h>
#include <linux/netfilter_ipv6.h>
#include <linux/export.h>
#include <net/ip6_checksum.h>
#include <linux/netconf.h>
#include <net/ip_tunnels.h>

#include <linux/nospec.h>

struct ip6mr_rule {
	struct fib_rule		common;
};

struct ip6mr_result {
	struct mr_table	*mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/* Multicast router control variables */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to original Alan's scheme. Hash table of resolved
   entries is changed only in process context and protected
   with weak lock mrt_lock. Queue of unresolved entries is protected
   with strong spinlock mfc_unres_lock.

   In this case data path is free of exclusive locks at all.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
static void ip6mr_free_table(struct mr_table *mrt);

static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
			   struct net_device *dev, struct sk_buff *skb,
			   struct mfc6_cache *cache);
static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
			      mifi_t mifi, int assert);
static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
			      int cmd);
static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
static int ip6mr_rtm_dumproute(struct sk_buff *skb,
			       struct netlink_callback *cb);
static void mroute_clean_tables(struct mr_table *mrt, bool all);
static void ipmr_expire_process(struct timer_list *t);

#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
#define ip6mr_for_each_table(mrt, net) \
	list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list)

static struct mr_table *ip6mr_mr_table_iter(struct net *net,
					    struct mr_table *mrt)
{
	struct mr_table *ret;

	if (!mrt)
		ret = list_entry_rcu(net->ipv6.mr6_tables.next,
				     struct mr_table, list);
	else
		ret = list_entry_rcu(mrt->list.next,
				     struct mr_table, list);

	if (&ret->list == &net->ipv6.mr6_tables)
		return NULL;
	return ret;
}

static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	ip6mr_for_each_table(mrt, net) {
		if (mrt->id == id)
			return mrt;
	}
	return NULL;
}

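/* Select the mr_table for this flow via the IP6MR policy rules */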
static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
			    struct mr_table **mrt)
{
	int err;
	struct ip6mr_result res;
	struct fib_lookup_arg arg = {
		.result = &res,
		.flags = FIB_LOOKUP_NOREF,
	};

	/* update flow if oif or iif point to device enslaved to l3mdev */
	l3mdev_update_flow(net, flowi6_to_flowi(flp6));

	err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
			       flowi6_to_flowi(flp6), 0, &arg);
	if (err < 0)
		return err;
	*mrt = res.mrt;
	return 0;
}

static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
			     int flags, struct fib_lookup_arg *arg)
{
	struct ip6mr_result *res = arg->result;
	struct mr_table *mrt;

	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;
	case FR_ACT_UNREACHABLE:
		return -ENETUNREACH;
	case FR_ACT_PROHIBIT:
		return -EACCES;
	case FR_ACT_BLACKHOLE:
	default:
		return -EINVAL;
	}

	arg->table = fib_rule_get_table(rule, arg);

	mrt = ip6mr_get_table(rule->fr_net, arg->table);
	if (!mrt)
		return -EAGAIN;
	res->mrt = mrt;
	return 0;
}

static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags)
{
	return 1;
}

static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
	FRA_GENERIC_POLICY,
};

static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
				struct fib_rule_hdr *frh, struct nlattr **tb,
				struct netlink_ext_ack *extack)
{
	return 0;
}

static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			      struct nlattr **tb)
{
	return 1;
}

static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
			   struct fib_rule_hdr *frh)
{
	frh->dst_len = 0;
	frh->src_len = 0;
	frh->tos     = 0;
	return 0;
}

static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
	.family		= RTNL_FAMILY_IP6MR,
	.rule_size	= sizeof(struct ip6mr_rule),
	.addr_size	= sizeof(struct in6_addr),
	.action		= ip6mr_rule_action,
	.match		= ip6mr_rule_match,
	.configure	= ip6mr_rule_configure,
	.compare	= ip6mr_rule_compare,
	.fill		= ip6mr_rule_fill,
	.nlgroup	= RTNLGRP_IPV6_RULE,
	.policy		= ip6mr_rule_policy,
	.owner		= THIS_MODULE,
};

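/* Register the IP6MR rule ops for this netns and create the default table */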
static int __net_init ip6mr_rules_init(struct net *net)
{
	struct fib_rules_ops *ops;
	struct mr_table *mrt;
	int err;

	ops = fib_rules_register(&ip6mr_rules_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);

	INIT_LIST_HEAD(&net->ipv6.mr6_tables);

	mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
	if (IS_ERR(mrt)) {
		err = PTR_ERR(mrt);
		goto err1;
	}

	err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0);
	if (err < 0)
		goto err2;

	net->ipv6.mr6_rules_ops = ops;
	return 0;

err2:
	ip6mr_free_table(mrt);
err1:
	fib_rules_unregister(ops);
	return err;
}

static void __net_exit ip6mr_rules_exit(struct net *net)
{
	struct mr_table *mrt, *next;

	rtnl_lock();
	list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
		list_del(&mrt->list);
		ip6mr_free_table(mrt);
	}
	fib_rules_unregister(net->ipv6.mr6_rules_ops);
	rtnl_unlock();
}

static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
{
	return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR);
}

static unsigned int ip6mr_rules_seq_read(struct net *net)
{
	return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR);
}

bool ip6mr_rule_default(const struct fib_rule *rule)
{
	return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL &&
	       rule->table == RT6_TABLE_DFLT && !rule->l3mdev;
}
EXPORT_SYMBOL(ip6mr_rule_default);
#else
#define ip6mr_for_each_table(mrt, net) \
	for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)

static struct mr_table *ip6mr_mr_table_iter(struct net *net,
					    struct mr_table *mrt)
{
	if (!mrt)
		return net->ipv6.mrt6;
	return NULL;
}

static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
{
	return net->ipv6.mrt6;
}

static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
			    struct mr_table **mrt)
{
	*mrt = net->ipv6.mrt6;
	return 0;
}

static int __net_init ip6mr_rules_init(struct net *net)
{
	struct mr_table *mrt;

	mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
	if (IS_ERR(mrt))
		return PTR_ERR(mrt);
	net->ipv6.mrt6 = mrt;
	return 0;
}

static void __net_exit ip6mr_rules_exit(struct net *net)
{
	rtnl_lock();
	ip6mr_free_table(net->ipv6.mrt6);
	net->ipv6.mrt6 = NULL;
	rtnl_unlock();
}

static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
{
	return 0;
}

static unsigned int ip6mr_rules_seq_read(struct net *net)
{
	return 0;
}
#endif

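/* rhashtable comparison function: returns 0 when both group and origin match */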
static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
			  const void *ptr)
{
	const struct mfc6_cache_cmp_arg *cmparg = arg->key;
	struct mfc6_cache *c = (struct mfc6_cache *)ptr;

	return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) ||
	       !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin);
}

static const struct rhashtable_params ip6mr_rht_params = {
	.head_offset = offsetof(struct mr_mfc, mnode),
	.key_offset = offsetof(struct mfc6_cache, cmparg),
	.key_len = sizeof(struct mfc6_cache_cmp_arg),
	.nelem_hint = 3,
	.locks_mul = 1,
	.obj_cmpfn = ip6mr_hash_cmp,
	.automatic_shrinking = true,
};

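/* Hook used at table allocation to link the new table into the per-netns list */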
static void ip6mr_new_table_set(struct mr_table *mrt,
				struct net *net)
{
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
	list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables);
#endif
}

static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = {
	.mf6c_origin = IN6ADDR_ANY_INIT,
	.mf6c_mcastgrp = IN6ADDR_ANY_INIT,
};

static struct mr_table_ops ip6mr_mr_table_ops = {
	.rht_params = &ip6mr_rht_params,
	.cmparg_any = &ip6mr_mr_table_ops_cmparg_any,
};

static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	mrt = ip6mr_get_table(net, id);
	if (mrt)
		return mrt;

	return mr_table_alloc(net, id, &ip6mr_mr_table_ops,
			      ipmr_expire_process, ip6mr_new_table_set);
}

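/* Tear down a table: stop the expiry timer and flush all cache entries */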
static void ip6mr_free_table(struct mr_table *mrt)
{
	del_timer_sync(&mrt->ipmr_expire_timer);
	mroute_clean_tables(mrt, true);
	rhltable_destroy(&mrt->mfc_hash);
	kfree(mrt);
}

#ifdef CONFIG_PROC_FS
/* The /proc interfaces to multicast routing
 * /proc/ip6_mr_cache /proc/ip6_mr_vif
 */

static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(mrt_lock)
{
	struct mr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
	if (!mrt)
		return ERR_PTR(-ENOENT);

	iter->mrt = mrt;

	read_lock(&mrt_lock);
	return mr_vif_seq_start(seq, pos);
}

static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
	__releases(mrt_lock)
{
	read_unlock(&mrt_lock);
}

static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
{
	struct mr_vif_iter *iter = seq->private;
	struct mr_table *mrt = iter->mrt;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags\n");
	} else {
		const struct vif_device *vif = v;
		const char *name = vif->dev ? vif->dev->name : "none";

		seq_printf(seq,
			   "%2td %-10s %8ld %7ld  %8ld %7ld %05X\n",
			   vif - mrt->vif_table,
			   name, vif->bytes_in, vif->pkt_in,
			   vif->bytes_out, vif->pkt_out,
			   vif->flags);
	}
	return 0;
}

static const struct seq_operations ip6mr_vif_seq_ops = {
	.start = ip6mr_vif_seq_start,
	.next  = mr_vif_seq_next,
	.stop  = ip6mr_vif_seq_stop,
	.show  = ip6mr_vif_seq_show,
};

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
	if (!mrt)
		return ERR_PTR(-ENOENT);

	return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Group                            "
			 "Origin                           "
			 "Iif      Pkts  Bytes     Wrong  Oifs\n");
	} else {
		const struct mfc6_cache *mfc = v;
		const struct mr_mfc_iter *it = seq->private;
		struct mr_table *mrt = it->mrt;

		seq_printf(seq, "%pI6 %pI6 %-3hd",
			   &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
			   mfc->_c.mfc_parent);

		if (it->cache != &mrt->mfc_unres_queue) {
			seq_printf(seq, " %8lu %8lu %8lu",
				   mfc->_c.mfc_un.res.pkt,
				   mfc->_c.mfc_un.res.bytes,
				   mfc->_c.mfc_un.res.wrong_if);
			for (n = mfc->_c.mfc_un.res.minvif;
			     n < mfc->_c.mfc_un.res.maxvif; n++) {
				if (VIF_EXISTS(mrt, n) &&
				    mfc->_c.mfc_un.res.ttls[n] < 255)
					seq_printf(seq,
						   " %2d:%-3d", n,
						   mfc->_c.mfc_un.res.ttls[n]);
			}
		} else {
			/* unresolved mfc_caches don't contain
			 * pkt, bytes and wrong_if values
			 */
			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

static const struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = mr_mfc_seq_next,
	.stop  = mr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};
#endif

#ifdef CONFIG_IPV6_PIMSM_V2

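/* Validate an incoming PIM register packet, strip the outer headers and
 * re-inject the encapsulated multicast packet via the pim6reg device.
 */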
static int pim6_rcv(struct sk_buff *skb)
{
	struct pimreghdr *pim;
	struct ipv6hdr   *encap;
	struct net_device  *reg_dev = NULL;
	struct net *net = dev_net(skb->dev);
	struct mr_table *mrt;
	struct flowi6 fl6 = {
		.flowi6_iif	= skb->dev->ifindex,
		.flowi6_mark	= skb->mark,
	};
	int reg_vif_num;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) ||
	    (pim->flags & PIM_NULL_REGISTER) ||
	    (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
			     sizeof(*pim), IPPROTO_PIM,
			     csum_partial((void *)pim, sizeof(*pim), 0)) &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	/* check if the inner packet is destined to mcast group */
	encap = (struct ipv6hdr *)(skb_transport_header(skb) +
				   sizeof(*pim));

	if (!ipv6_addr_is_multicast(&encap->daddr) ||
	    encap->payload_len == 0 ||
	    ntohs(encap->payload_len) + sizeof(*pim) > skb->len)
		goto drop;

	if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
		goto drop;
	reg_vif_num = mrt->mroute_reg_vif_num;

	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = mrt->vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (!reg_dev)
		goto drop;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8 *)encap - skb->data);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IPV6);
	skb->ip_summed = CHECKSUM_NONE;

	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));

	netif_rx(skb);

	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}

static const struct inet6_protocol pim6_protocol = {
	.handler	=	pim6_rcv,
};

/* Service routines creating virtual interfaces: PIMREG */

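/* Packets transmitted on the register VIF are not sent on a wire; they are
 * bounced to the daemon as MRT6MSG_WHOLEPKT cache reports.
 */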
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct mr_table *mrt;
	struct flowi6 fl6 = {
		.flowi6_oif	= dev->ifindex,
		.flowi6_iif	= skb->skb_iif ? : LOOPBACK_IFINDEX,
		.flowi6_mark	= skb->mark,
	};

	if (!pskb_inet_may_pull(skb))
		goto tx_err;

	if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
		goto tx_err;

	read_lock(&mrt_lock);
	dev->stats.tx_bytes += skb->len;
	dev->stats.tx_packets++;
	ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT);
	read_unlock(&mrt_lock);
	kfree_skb(skb);
	return NETDEV_TX_OK;

tx_err:
	dev->stats.tx_errors++;
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int reg_vif_get_iflink(const struct net_device *dev)
{
	return 0;
}

static const struct net_device_ops reg_vif_netdev_ops = {
	.ndo_start_xmit	= reg_vif_xmit,
	.ndo_get_iflink = reg_vif_get_iflink,
};

static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
	dev->mtu		= 1500 - sizeof(struct ipv6hdr) - 8;
	dev->flags		= IFF_NOARP;
	dev->netdev_ops		= &reg_vif_netdev_ops;
	dev->needs_free_netdev	= true;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt)
{
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (mrt->id == RT6_TABLE_DFLT)
		sprintf(name, "pim6reg");
	else
		sprintf(name, "pim6reg%u", mrt->id);

	dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}

	if (dev_open(dev, NULL))
		goto failure;

	dev_hold(dev);
	return dev;

failure:
	unregister_netdevice(dev);
	return NULL;
}
#endif

static int call_ip6mr_vif_entry_notifiers(struct net *net,
					  enum fib_event_type event_type,
					  struct vif_device *vif,
					  mifi_t vif_index, u32 tb_id)
{
	return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
				     vif, vif_index, tb_id,
				     &net->ipv6.ipmr_seq);
}

static int call_ip6mr_mfc_entry_notifiers(struct net *net,
					  enum fib_event_type event_type,
					  struct mfc6_cache *mfc, u32 tb_id)
{
	return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
				     &mfc->_c, tb_id, &net->ipv6.ipmr_seq);
}

/* Delete a VIF entry */
static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
		       struct list_head *head)
{
	struct vif_device *v;
	struct net_device *dev;
	struct inet6_dev *in6_dev;

	if (vifi < 0 || vifi >= mrt->maxvif)
		return -EADDRNOTAVAIL;

	v = &mrt->vif_table[vifi];

	if (VIF_EXISTS(mrt, vifi))
		call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
					       FIB_EVENT_VIF_DEL, v, vifi,
					       mrt->id);

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

#ifdef CONFIG_IPV6_PIMSM_V2
	if (vifi == mrt->mroute_reg_vif_num)
		mrt->mroute_reg_vif_num = -1;
#endif

	if (vifi + 1 == mrt->maxvif) {
		int tmp;
		for (tmp = vifi - 1; tmp >= 0; tmp--) {
			if (VIF_EXISTS(mrt, tmp))
				break;
		}
		mrt->maxvif = tmp + 1;
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

	in6_dev = __in6_dev_get(dev);
	if (in6_dev) {
		in6_dev->cnf.mc_forwarding--;
		inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
					     NETCONFA_MC_FORWARDING,
					     dev->ifindex, &in6_dev->cnf);
	}

	if ((v->flags & MIFF_REGISTER) && !notify)
		unregister_netdevice_queue(dev, head);

	dev_put(dev);
	return 0;
}

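/* Free a cache entry once an RCU grace period has elapsed */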
static inline void ip6mr_cache_free_rcu(struct rcu_head *head)
{
	struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);

	kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c);
}

static inline void ip6mr_cache_free(struct mfc6_cache *c)
{
	call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu);
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c)
{
	struct net *net = read_pnet(&mrt->net);
	struct sk_buff *skb;

	atomic_dec(&mrt->cache_resolve_queue_len);

	while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) {
		if (ipv6_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = skb_pull(skb,
							sizeof(struct ipv6hdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT;
			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
		} else
			kfree_skb(skb);
	}

	ip6mr_cache_free(c);
}


/* Timer process for all the unresolved queue. */

static void ipmr_do_expire_process(struct mr_table *mrt)
{
	unsigned long now = jiffies;
	unsigned long expires = 10 * HZ;
	struct mr_mfc *c, *next;

	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
		if (time_after(c->mfc_un.unres.expires, now)) {
			/* not yet... */
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			continue;
		}

		list_del(&c->list);
		mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
		ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
	}

	if (!list_empty(&mrt->mfc_unres_queue))
		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
}

static void ipmr_expire_process(struct timer_list *t)
{
	struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);

	if (!spin_trylock(&mfc_unres_lock)) {
		mod_timer(&mrt->ipmr_expire_timer, jiffies + 1);
		return;
	}

	if (!list_empty(&mrt->mfc_unres_queue))
		ipmr_do_expire_process(mrt);

	spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */

static void ip6mr_update_thresholds(struct mr_table *mrt,
				    struct mr_mfc *cache,
				    unsigned char *ttls)
{
	int vifi;

	cache->mfc_un.res.minvif = MAXMIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXMIFS);

	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
		if (VIF_EXISTS(mrt, vifi) &&
		    ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
	cache->mfc_un.res.lastuse = jiffies;
}

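/* Add a virtual interface; called under RTNL from the setsockopt path */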
static int mif6_add(struct net *net, struct mr_table *mrt,
		    struct mif6ctl *vifc, int mrtsock)
{
	int vifi = vifc->mif6c_mifi;
	struct vif_device *v = &mrt->vif_table[vifi];
	struct net_device *dev;
	struct inet6_dev *in6_dev;
	int err;

	/* Is vif busy ? */
	if (VIF_EXISTS(mrt, vifi))
		return -EADDRINUSE;

	switch (vifc->mif6c_flags) {
#ifdef CONFIG_IPV6_PIMSM_V2
	case MIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (mrt->mroute_reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ip6mr_reg_vif(net, mrt);
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
			dev_put(dev);
			return err;
		}
		break;
#endif
	case 0:
		dev = dev_get_by_index(net, vifc->mif6c_pifi);
		if (!dev)
			return -EADDRNOTAVAIL;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			dev_put(dev);
			return err;
		}
		break;
	default:
		return -EINVAL;
	}

	in6_dev = __in6_dev_get(dev);
	if (in6_dev) {
		in6_dev->cnf.mc_forwarding++;
		inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
					     NETCONFA_MC_FORWARDING,
					     dev->ifindex, &in6_dev->cnf);
	}

	/* Fill in the VIF structures */
	vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold,
			vifc->mif6c_flags | (!mrtsock ? VIFF_STATIC : 0),
			MIFF_REGISTER);

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
	v->dev = dev;
#ifdef CONFIG_IPV6_PIMSM_V2
	if (v->flags & MIFF_REGISTER)
		mrt->mroute_reg_vif_num = vifi;
#endif
	if (vifi + 1 > mrt->maxvif)
		mrt->maxvif = vifi + 1;
	write_unlock_bh(&mrt_lock);
	call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD,
				       v, vifi, mrt->id);
	return 0;
}

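/* Look for a (S,G) entry */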
static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt,
					   const struct in6_addr *origin,
					   const struct in6_addr *mcastgrp)
{
	struct mfc6_cache_cmp_arg arg = {
		.mf6c_origin = *origin,
		.mf6c_mcastgrp = *mcastgrp,
	};

	return mr_mfc_find(mrt, &arg);
}

/* Look for a (*,G) entry */
static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt,
					       struct in6_addr *mcastgrp,
					       mifi_t mifi)
{
	struct mfc6_cache_cmp_arg arg = {
		.mf6c_origin = in6addr_any,
		.mf6c_mcastgrp = *mcastgrp,
	};

	if (ipv6_addr_any(mcastgrp))
		return mr_mfc_find_any_parent(mrt, mifi);
	return mr_mfc_find_any(mrt, mifi, &arg);
}

/* Look for a (S,G,iif) entry if parent != -1 */
static struct mfc6_cache *
ip6mr_cache_find_parent(struct mr_table *mrt,
			const struct in6_addr *origin,
			const struct in6_addr *mcastgrp,
			int parent)
{
	struct mfc6_cache_cmp_arg arg = {
		.mf6c_origin = *origin,
		.mf6c_mcastgrp = *mcastgrp,
	};

	return mr_mfc_find_parent(mrt, &arg, parent);
}

/* Allocate a multicast cache entry */
static struct mfc6_cache *ip6mr_cache_alloc(void)
{
	struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
	if (!c)
		return NULL;
	c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
	c->_c.mfc_un.res.minvif = MAXMIFS;
	c->_c.free = ip6mr_cache_free_rcu;
	refcount_set(&c->_c.mfc_un.res.refcount, 1);
	return c;
}

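/* Allocate an entry for the unresolved queue (atomic context) */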
static struct mfc6_cache *ip6mr_cache_alloc_unres(void)
{
	struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
	if (!c)
		return NULL;
	skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
	c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
	return c;
}

/*
 *	A cache entry has gone into a resolved state from queued
 */

static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
				struct mfc6_cache *uc, struct mfc6_cache *c)
{
	struct sk_buff *skb;

	/*
	 *	Play the pending entries through our router
	 */

	while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
		if (ipv6_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = skb_pull(skb,
							sizeof(struct ipv6hdr));

			if (mr_fill_mroute(mrt, skb, &c->_c,
					   nlmsg_data(nlh)) > 0) {
				nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh;
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE;
			}
			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
		} else
			ip6_mr_forward(net, mrt, skb->dev, skb, c);
	}
}

/*
 *	Bounce a cache query up to pim6sd and netlink.
 *
 *	Called under mrt_lock.
 */

static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
			      mifi_t mifi, int assert)
{
	struct sock *mroute6_sk;
	struct sk_buff *skb;
	struct mrt6msg *msg;
	int ret;

#ifdef CONFIG_IPV6_PIMSM_V2
	if (assert == MRT6MSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt)
						+sizeof(*msg));
	else
#endif
		skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;

	/* I suppose that internal messages
	 * do not require checksums */

	skb->ip_summed = CHECKSUM_UNNECESSARY;

#ifdef CONFIG_IPV6_PIMSM_V2
	if (assert == MRT6MSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
		   Duplicate old header, fix length etc.
		   And all this only to mangle msg->im6_msgtype and
		   to set msg->im6_mbz to "mbz" :-)
		 */
		skb_push(skb, -skb_network_offset(pkt));

		skb_push(skb, sizeof(*msg));
		skb_reset_transport_header(skb);
		msg = (struct mrt6msg *)skb_transport_header(skb);
		msg->im6_mbz = 0;
		msg->im6_msgtype = MRT6MSG_WHOLEPKT;
		msg->im6_mif = mrt->mroute_reg_vif_num;
		msg->im6_pad = 0;
		msg->im6_src = ipv6_hdr(pkt)->saddr;
		msg->im6_dst = ipv6_hdr(pkt)->daddr;

		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else
#endif
	{
	/*
	 *	Copy the IP header
	 */

	skb_put(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr));

	/*
	 *	Add our header
	 */
	skb_put(skb, sizeof(*msg));
	skb_reset_transport_header(skb);
	msg = (struct mrt6msg *)skb_transport_header(skb);

	msg->im6_mbz = 0;
	msg->im6_msgtype = assert;
	msg->im6_mif = mifi;
	msg->im6_pad = 0;
	msg->im6_src = ipv6_hdr(pkt)->saddr;
	msg->im6_dst = ipv6_hdr(pkt)->daddr;

	skb_dst_set(skb, dst_clone(skb_dst(pkt)));
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	}

	rcu_read_lock();
	mroute6_sk = rcu_dereference(mrt->mroute_sk);
	if (!mroute6_sk) {
		rcu_read_unlock();
		kfree_skb(skb);
		return -EINVAL;
	}

	mrt6msg_netlink_event(mrt, skb);

	/* Deliver to user space multicast routing algorithms */
	ret = sock_queue_rcv_skb(mroute6_sk, skb);
	rcu_read_unlock();
	if (ret < 0) {
		net_warn_ratelimited("mroute6: pending queue full, dropping entries\n");
		kfree_skb(skb);
	}

	return ret;
}

/* Queue a packet for resolution. It gets locked cache entry! */
static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
				  struct sk_buff *skb, struct net_device *dev)
{
	struct mfc6_cache *c;
	bool found = false;
	int err;

	spin_lock_bh(&mfc_unres_lock);
	list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
		if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
		    ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) {
			found = true;
			break;
		}
	}

	if (!found) {
		/*
		 *	Create a new entry if allowable
		 */

		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
		    (c = ip6mr_cache_alloc_unres()) == NULL) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/* Fill in the new cache entry */
		c->_c.mfc_parent = -1;
		c->mf6c_origin = ipv6_hdr(skb)->saddr;
		c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr;

		/*
		 *	Reflect first query at pim6sd
		 */
		err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE);
		if (err < 0) {
			/* If the report failed throw the cache entry
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			ip6mr_cache_free(c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&mrt->cache_resolve_queue_len);
		list_add(&c->_c.list, &mrt->mfc_unres_queue);
		mr6_netlink_event(mrt, c, RTM_NEWROUTE);

		ipmr_do_expire_process(mrt);
	}

	/* See if we can append the packet */
	if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		if (dev) {
			skb->dev = dev;
			skb->skb_iif = dev->ifindex;
		}
		skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}

/*
 *	MFC6 cache manipulation by user space
 */

static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc,
			    int parent)
{
	struct mfc6_cache *c;

	/* The entries are added/deleted only under RTNL */
	rcu_read_lock();
	c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr,
				    &mfc->mf6cc_mcastgrp.sin6_addr, parent);
	rcu_read_unlock();
	if (!c)
		return -ENOENT;
	rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params);
	list_del_rcu(&c->_c.list);

	call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
				       FIB_EVENT_ENTRY_DEL, c, mrt->id);
	mr6_netlink_event(mrt, c, RTM_DELROUTE);
	mr_cache_put(&c->_c);
	return 0;
}

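/* On NETDEV_UNREGISTER, remove every VIF still bound to the dying device */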
static int ip6mr_device_event(struct notifier_block *this,
			      unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct mr_table *mrt;
	struct vif_device *v;
	int ct;

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;

	ip6mr_for_each_table(mrt, net) {
		v = &mrt->vif_table[0];
		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
			if (v->dev == dev)
				mif6_delete(mrt, ct, 1, NULL);
		}
	}

	return NOTIFY_DONE;
}

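/* Notifier sequence number: table updates plus policy rule updates */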
static unsigned int ip6mr_seq_read(struct net *net)
{
	ASSERT_RTNL();

	return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net);
}

static int ip6mr_dump(struct net *net, struct notifier_block *nb)
{
	return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
		       ip6mr_mr_table_iter, &mrt_lock);
}