2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void ip6_dst_destroy(struct dst_entry *);
86 static void ip6_dst_ifdown(struct dst_entry *,
87 struct net_device *dev, int how);
88 static int ip6_dst_gc(struct dst_ops *ops);
90 static int ip6_pkt_discard(struct sk_buff *skb);
91 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int ip6_pkt_prohibit(struct sk_buff *skb);
93 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void ip6_link_failure(struct sk_buff *skb);
95 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 struct sk_buff *skb, u32 mtu);
97 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct fib6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102 struct fib6_info *rt, struct dst_entry *dst,
103 struct in6_addr *dest, struct in6_addr *src,
104 int iif, int type, u32 portid, u32 seq,
106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107 struct in6_addr *daddr,
108 struct in6_addr *saddr);
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct fib6_info *rt6_add_route_info(struct net *net,
112 const struct in6_addr *prefix, int prefixlen,
113 const struct in6_addr *gwaddr,
114 struct net_device *dev,
116 static struct fib6_info *rt6_get_route_info(struct net *net,
117 const struct in6_addr *prefix, int prefixlen,
118 const struct in6_addr *gwaddr,
119 struct net_device *dev);
/* Per-cpu list of "uncached" rt6_info dsts that live outside the fib6
 * tree, each protected by a per-cpu spinlock.
 * NOTE(review): this extract looks truncated (braces/fields missing);
 * confirm every block against the complete file.
 */
122 struct uncached_list {
124 struct list_head head;
127 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Link @rt onto the current CPU's uncached list under the list lock. */
129 void rt6_uncached_list_add(struct rt6_info *rt)
131 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
133 rt->rt6i_uncached_list = ul;
135 spin_lock_bh(&ul->lock);
136 list_add_tail(&rt->rt6i_uncached, &ul->head);
137 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the list it was added on and decrement the per-netns
 * uncache counter.  A never-added entry (empty list head) is a no-op.
 */
140 void rt6_uncached_list_del(struct rt6_info *rt)
142 if (!list_empty(&rt->rt6i_uncached)) {
143 struct uncached_list *ul = rt->rt6i_uncached_list;
144 struct net *net = dev_net(rt->dst.dev);
146 spin_lock_bh(&ul->lock);
147 list_del(&rt->rt6i_uncached);
148 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
149 spin_unlock_bh(&ul->lock);
/* When @dev goes away, retarget uncached entries referencing it to the
 * netns loopback device so the device refcount can drop to zero.
 */
153 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
155 struct net_device *loopback_dev = net->loopback_dev;
/* nothing to do when the loopback device itself is going away */
158 if (dev == loopback_dev)
161 for_each_possible_cpu(cpu) {
162 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 spin_lock_bh(&ul->lock);
166 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
167 struct inet6_dev *rt_idev = rt->rt6i_idev;
168 struct net_device *rt_dev = rt->dst.dev;
/* swap the idev reference over to loopback */
170 if (rt_idev->dev == dev) {
171 rt->rt6i_idev = in6_dev_get(loopback_dev);
172 in6_dev_put(rt_idev);
176 rt->dst.dev = loopback_dev;
177 dev_hold(rt->dst.dev);
181 spin_unlock_bh(&ul->lock);
/* Pick the neighbour-lookup key: the configured gateway when set,
 * otherwise the packet's own destination (requires a valid @skb).
 */
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
189 if (!ipv6_addr_any(p))
190 return (const void *) p;
192 return &ipv6_hdr(skb)->daddr;
/* Look up, and if absent create, the ndisc neighbour entry for the
 * nexthop of (@gw, @dev).
 */
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197 struct net_device *dev,
203 daddr = choose_neigh_daddr(gw, skb, daddr);
204 n = __ipv6_neigh_lookup(dev, daddr);
207 return neigh_create(&nd_tbl, daddr, dev);
/* dst_ops.neigh_lookup hook: resolve via the route's gateway. */
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
214 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
216 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
/* dst_ops.confirm_neigh hook: confirm nexthop reachability, skipping
 * NOARP/loopback devices and multicast destinations.
 */
219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
221 struct net_device *dev = dst->dev;
222 struct rt6_info *rt = (struct rt6_info *)dst;
224 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
227 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
229 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
231 __ipv6_confirm_neigh(dev, daddr);
/* dst_ops for regular IPv6 dsts; copied into each netns at init. */
234 static struct dst_ops ip6_dst_ops_template = {
238 .check = ip6_dst_check,
239 .default_advmss = ip6_default_advmss,
241 .cow_metrics = dst_cow_metrics_generic,
242 .destroy = ip6_dst_destroy,
243 .ifdown = ip6_dst_ifdown,
244 .negative_advice = ip6_negative_advice,
245 .link_failure = ip6_link_failure,
246 .update_pmtu = ip6_rt_update_pmtu,
247 .redirect = rt6_do_redirect,
248 .local_out = __ip6_local_out,
249 .neigh_lookup = ip6_dst_neigh_lookup,
250 .confirm_neigh = ip6_confirm_neigh,
/* MTU for blackhole dsts: explicit RTAX_MTU metric, else device MTU. */
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
255 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
257 return mtu ? : dst->dev->mtu;
/* Blackhole dsts deliberately ignore PMTU updates and redirects;
 * these are intentionally-empty dst_ops callbacks.
 */
260 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
261 struct sk_buff *skb, u32 mtu)
265 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops for blackhole clones of regular dsts. */
270 static struct dst_ops ip6_dst_blackhole_ops = {
272 .destroy = ip6_dst_destroy,
273 .check = ip6_dst_check,
274 .mtu = ip6_blackhole_mtu,
275 .default_advmss = ip6_default_advmss,
276 .update_pmtu = ip6_rt_blackhole_update_pmtu,
277 .redirect = ip6_rt_blackhole_redirect,
278 .cow_metrics = dst_cow_metrics_generic,
279 .neigh_lookup = ip6_dst_neigh_lookup,
/* Metrics template used by the null entry (hoplimit 0 == use default). */
282 static const u32 ip6_template_metrics[RTAX_MAX] = {
283 [RTAX_HOPLIMIT - 1] = 0,
/* fib6 "null" route: matches nothing, rejects as unreachable; returned
 * by lookups that find no real route.
 */
286 static const struct fib6_info fib6_null_entry_template = {
287 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
288 .fib6_protocol = RTPROT_KERNEL,
289 .fib6_metric = ~(u32)0,
290 .fib6_ref = ATOMIC_INIT(1),
291 .fib6_type = RTN_UNREACHABLE,
292 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
/* rt6 null entry: discards traffic with -ENETUNREACH. */
295 static const struct rt6_info ip6_null_entry_template = {
297 .__refcnt = ATOMIC_INIT(1),
299 .obsolete = DST_OBSOLETE_FORCE_CHK,
300 .error = -ENETUNREACH,
301 .input = ip6_pkt_discard,
302 .output = ip6_pkt_discard_out,
304 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Policy-routing "prohibit" entry: rejects via ip6_pkt_prohibit*. */
309 static const struct rt6_info ip6_prohibit_entry_template = {
311 .__refcnt = ATOMIC_INIT(1),
313 .obsolete = DST_OBSOLETE_FORCE_CHK,
315 .input = ip6_pkt_prohibit,
316 .output = ip6_pkt_prohibit_out,
318 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
/* Policy-routing "blackhole" entry: silently discards traffic. */
321 static const struct rt6_info ip6_blk_hole_entry_template = {
323 .__refcnt = ATOMIC_INIT(1),
325 .obsolete = DST_OBSOLETE_FORCE_CHK,
327 .input = dst_discard,
328 .output = dst_discard_out,
330 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
/* Zero all rt6_info state that follows the embedded dst_entry and
 * re-init the uncached list head.
 */
335 static void rt6_info_init(struct rt6_info *rt)
337 struct dst_entry *dst = &rt->dst;
339 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
340 INIT_LIST_HEAD(&rt->rt6i_uncached);
343 /* allocate dst with ip6_dst_ops */
344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
347 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
348 1, DST_OBSOLETE_FORCE_CHK, flags);
/* account the allocation in the per-netns route statistics */
352 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
357 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops.destroy: release metrics, uncached-list membership, the
 * inet6_dev reference and the fib6_info this dst was cloned from.
 */
359 static void ip6_dst_destroy(struct dst_entry *dst)
361 struct rt6_info *rt = (struct rt6_info *)dst;
362 struct fib6_info *from;
363 struct inet6_dev *idev;
365 dst_destroy_metrics_generic(dst);
366 rt6_uncached_list_del(rt);
368 idev = rt->rt6i_idev;
370 rt->rt6i_idev = NULL;
/* detach and drop the originating fib6_info */
375 from = rcu_dereference(rt->from);
376 rcu_assign_pointer(rt->from, NULL);
377 fib6_info_release(from);
/* dst_ops.ifdown: move the dst's inet6_dev reference to the netns
 * loopback device when its device is going down.
 */
381 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
384 struct rt6_info *rt = (struct rt6_info *)dst;
385 struct inet6_dev *idev = rt->rt6i_idev;
386 struct net_device *loopback_dev =
387 dev_net(dev)->loopback_dev;
389 if (idev && idev->dev != loopback_dev) {
390 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
392 rt->rt6i_idev = loopback_idev;
/* true when the dst itself carries RTF_EXPIRES and its expiry has
 * passed; does not consult the originating fib6_info.
 */
398 static bool __rt6_check_expired(const struct rt6_info *rt)
400 if (rt->rt6i_flags & RTF_EXPIRES)
401 return time_after(jiffies, rt->dst.expires);
/* Full expiry check: own RTF_EXPIRES timer first, then fall back to
 * the obsolete marker / the expiry of the fib6_info it came from.
 */
406 static bool rt6_check_expired(const struct rt6_info *rt)
408 struct fib6_info *from;
410 from = rcu_dereference(rt->from);
412 if (rt->rt6i_flags & RTF_EXPIRES) {
413 if (time_after(jiffies, rt->dst.expires))
416 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
417 fib6_check_expired(from);
/* Choose one nexthop among @match and its ECMP siblings by comparing
 * the flow hash against each nexthop's hash-threshold upper bound.
 */
422 static struct fib6_info *rt6_multipath_select(const struct net *net,
423 struct fib6_info *match,
424 struct flowi6 *fl6, int oif,
425 const struct sk_buff *skb,
428 struct fib6_info *sibling, *next_sibling;
430 /* We might have already computed the hash for ICMPv6 errors. In such
431 * case it will always be non-zero. Otherwise now is the time to do it.
434 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
436 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
439 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
443 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
444 if (fl6->mp_hash > nh_upper_bound)
/* skip siblings that score as unusable (e.g. unreachable nexthop) */
446 if (rt6_score_route(sibling, oif, strict) < 0)
456 * Route lookup. rcu_read_lock() should be held.
/* Filter @rt (and its siblings on the same fib6 node) by outgoing
 * interface and/or source address; returns fib6_null_entry when a
 * strict interface match is required but none is found.
 */
459 static inline struct fib6_info *rt6_device_match(struct net *net,
460 struct fib6_info *rt,
461 const struct in6_addr *saddr,
465 struct fib6_info *sprt;
467 if (!oif && ipv6_addr_any(saddr) &&
468 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
471 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
472 const struct net_device *dev = sprt->fib6_nh.nh_dev;
474 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
478 if (dev->ifindex == oif)
481 if (ipv6_chk_addr(net, saddr, dev,
482 flags & RT6_LOOKUP_F_IFACE))
487 if (oif && flags & RT6_LOOKUP_F_IFACE)
488 return net->ipv6.fib6_null_entry;
490 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
493 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work container for sending one neighbour solicitation. */
494 struct __rt6_probe_work {
495 struct work_struct work;
496 struct in6_addr target;
497 struct net_device *dev;
/* Workqueue callback: send an NS to the target's solicited-node
 * multicast address, then (presumably) release the work item/device
 * reference - tail of the function is not visible in this extract.
 */
500 static void rt6_probe_deferred(struct work_struct *w)
502 struct in6_addr mcaddr;
503 struct __rt6_probe_work *work =
504 container_of(w, struct __rt6_probe_work, work);
506 addrconf_addr_solict_mult(&work->target, &mcaddr);
507 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
/* Router Reachability Probing (RFC 4191): probe a gateway route's
 * nexthop if its neighbour entry is not currently NUD_VALID.
 */
512 static void rt6_probe(struct fib6_info *rt)
514 struct __rt6_probe_work *work;
515 const struct in6_addr *nh_gw;
516 struct neighbour *neigh;
517 struct net_device *dev;
520 * Okay, this does not seem to be appropriate
521 * for now, however, we need to check if it
522 * is really so; aka Router Reachability Probing.
524 * Router Reachability Probe MUST be rate-limited
525 * to no more than one per minute.
527 if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
530 nh_gw = &rt->fib6_nh.nh_gw;
531 dev = rt->fib6_nh.nh_dev;
533 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
535 struct inet6_dev *idev;
537 if (neigh->nud_state & NUD_VALID)
540 idev = __in6_dev_get(dev);
542 write_lock(&neigh->lock);
/* rate-limit probes per rtr_probe_interval */
543 if (!(neigh->nud_state & NUD_VALID) &&
545 neigh->updated + idev->cnf.rtr_probe_interval)) {
546 work = kmalloc(sizeof(*work), GFP_ATOMIC);
548 __neigh_set_probe_once(neigh);
550 write_unlock(&neigh->lock);
/* no neighbour entry yet: probe unconditionally */
552 work = kmalloc(sizeof(*work), GFP_ATOMIC);
556 INIT_WORK(&work->work, rt6_probe_deferred);
557 work->target = *nh_gw;
560 schedule_work(&work->work);
564 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF stub: probing disabled. */
567 static inline void rt6_probe(struct fib6_info *rt)
573 * Default Router Selection (RFC 2461 6.3.6)
/* Device component of the route score: non-zero when @rt's device is
 * acceptable for @oif (details of the scoring values are truncated
 * from this extract).
 */
575 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
577 const struct net_device *dev = rt->fib6_nh.nh_dev;
579 if (!oif || dev->ifindex == oif)
/* Classify the nexthop's neighbour reachability for router selection:
 * SUCCEED for valid (or non-gateway) nexthops, FAIL_PROBE/FAIL_DO_RR
 * to request probing or round-robin fallback.
 */
584 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
586 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
587 struct neighbour *neigh;
589 if (rt->fib6_flags & RTF_NONEXTHOP ||
590 !(rt->fib6_flags & RTF_GATEWAY))
591 return RT6_NUD_SUCCEED;
594 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
597 read_lock(&neigh->lock);
598 if (neigh->nud_state & NUD_VALID)
599 ret = RT6_NUD_SUCCEED;
600 #ifdef CONFIG_IPV6_ROUTER_PREF
/* with router preference, any not-failed state is acceptable */
601 else if (!(neigh->nud_state & NUD_FAILED))
602 ret = RT6_NUD_SUCCEED;
604 ret = RT6_NUD_FAIL_PROBE;
606 read_unlock(&neigh->lock);
608 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
609 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
611 rcu_read_unlock_bh();
/* Combined score: device match, RFC 4191 preference bits, and (under
 * RT6_LOOKUP_F_REACHABLE) the neighbour reachability classification.
 */
616 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
620 m = rt6_check_dev(rt, oif);
621 if (!m && (strict & RT6_LOOKUP_F_IFACE))
622 return RT6_NUD_FAIL_HARD;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
626 if (strict & RT6_LOOKUP_F_REACHABLE) {
627 int n = rt6_check_neigh(rt);
634 /* called with rcu_read_lock held */
/* Whether the per-device "ignore routes with linkdown" sysctl is set
 * for @f6i's nexthop device.
 */
635 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
637 const struct net_device *dev = fib6_info_nh_dev(f6i);
641 const struct inet6_dev *idev = __in6_dev_get(dev);
643 rc = !!idev->cnf.ignore_routes_with_linkdown;
/* Score @rt and keep it as the new best match when it beats *mpri;
 * dead/linkdown/expired routes are skipped.  *do_rr is set when the
 * winning route asked for round-robin selection.
 */
649 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
650 int *mpri, struct fib6_info *match,
654 bool match_do_rr = false;
656 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
659 if (fib6_ignore_linkdown(rt) &&
660 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
661 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
664 if (fib6_check_expired(rt))
667 m = rt6_score_route(rt, oif, strict);
668 if (m == RT6_NUD_FAIL_DO_RR) {
670 m = 0; /* lowest valid score */
671 } else if (m == RT6_NUD_FAIL_HARD) {
675 if (strict & RT6_LOOKUP_F_REACHABLE)
678 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
680 *do_rr = match_do_rr;
/* Scan the node's route list (starting at the round-robin head, then
 * wrapping from the leaf) for the best-scoring route at @metric.
 */
688 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
689 struct fib6_info *leaf,
690 struct fib6_info *rr_head,
691 u32 metric, int oif, int strict,
694 struct fib6_info *rt, *match, *cont;
699 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
700 if (rt->fib6_metric != metric) {
705 match = find_match(rt, oif, strict, &mpri, match, do_rr);
708 for (rt = leaf; rt && rt != rr_head;
709 rt = rcu_dereference(rt->rt6_next)) {
710 if (rt->fib6_metric != metric) {
715 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* continuation list (routes after a metric change), if any */
721 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
722 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Select the route to use from fib6 node @fn, honouring default-router
 * preference and advancing the round-robin pointer when requested.
 * Returns fib6_null_entry when nothing usable is found.
 */
727 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
730 struct fib6_info *leaf = rcu_dereference(fn->leaf);
731 struct fib6_info *match, *rt0;
735 if (!leaf || leaf == net->ipv6.fib6_null_entry)
736 return net->ipv6.fib6_null_entry;
738 rt0 = rcu_dereference(fn->rr_ptr);
742 /* Double check to make sure fn is not an intermediate node
743 * and fn->leaf does not points to its child's leaf
744 * (This might happen if all routes under fn are deleted from
745 * the tree and fib6_repair_tree() is called on the node.)
747 key_plen = rt0->fib6_dst.plen;
748 #ifdef CONFIG_IPV6_SUBTREES
749 if (rt0->fib6_src.plen)
750 key_plen = rt0->fib6_src.plen;
752 if (fn->fn_bit != key_plen)
753 return net->ipv6.fib6_null_entry;
755 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
759 struct fib6_info *next = rcu_dereference(rt0->rt6_next);
761 /* no entries matched; do round-robin */
762 if (!next || next->fib6_metric != rt0->fib6_metric)
766 spin_lock_bh(&leaf->fib6_table->tb6_lock);
767 /* make sure next is not being deleted from the tree */
769 rcu_assign_pointer(fn->rr_ptr, next);
770 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
774 return match ? match : net->ipv6.fib6_null_entry;
/* True when the route has an explicit nexthop gateway (or NONEXTHOP). */
777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
779 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
782 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information Option received in a Router
 * Advertisement from @gwaddr on @dev: validate it, then add, refresh
 * or expire the corresponding RTF_ROUTEINFO route.
 */
783 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
784 const struct in6_addr *gwaddr)
786 struct net *net = dev_net(dev);
787 struct route_info *rinfo = (struct route_info *) opt;
788 struct in6_addr prefix_buf, *prefix;
790 unsigned long lifetime;
791 struct fib6_info *rt;
793 if (len < sizeof(struct route_info)) {
797 /* Sanity check for prefix_len and length */
798 if (rinfo->length > 3) {
800 } else if (rinfo->prefix_len > 128) {
802 } else if (rinfo->prefix_len > 64) {
803 if (rinfo->length < 2) {
806 } else if (rinfo->prefix_len > 0) {
807 if (rinfo->length < 1) {
812 pref = rinfo->route_pref;
813 if (pref == ICMPV6_ROUTER_PREF_INVALID)
816 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means the full 128-bit prefix is present in the option */
818 if (rinfo->length == 3)
819 prefix = (struct in6_addr *)rinfo->prefix;
821 /* this function is safe */
822 ipv6_addr_prefix(&prefix_buf,
823 (struct in6_addr *)rinfo->prefix,
825 prefix = &prefix_buf;
/* zero-length prefix means the default route to this router */
828 if (rinfo->prefix_len == 0)
829 rt = rt6_get_dflt_router(net, gwaddr, dev);
831 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
/* zero lifetime withdraws an existing route */
834 if (rt && !lifetime) {
840 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
843 rt->fib6_flags = RTF_ROUTEINFO |
844 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
847 if (!addrconf_finite_timeout(lifetime))
848 fib6_clean_expires(rt);
850 fib6_set_expires(rt, jiffies + HZ * lifetime);
852 fib6_info_release(rt);
859 * Misc support functions
862 /* called with rcu_read_lock held */
/* Device to attach to dst copies of @rt: the L3 master for enslaved
 * devices, the netns loopback for local/anycast routes, otherwise the
 * nexthop device itself.
 */
863 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
865 struct net_device *dev = rt->fib6_nh.nh_dev;
867 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
868 /* for copies of local routes, dst->dev needs to be the
869 * device if it is a master device, the master device if
870 * device is enslaved, and the loopback as the default
872 if (netif_is_l3_slave(dev) &&
873 !rt6_need_strict(&rt->fib6_dst.addr))
874 dev = l3mdev_master_dev_rcu(dev);
875 else if (!netif_is_l3_master(dev))
876 dev = dev_net(dev)->loopback_dev;
877 /* last case is netif_is_l3_master(dev) is true in which
878 * case we want dev returned to be dev
/* Map RTN_* route types to the errno reported to senders. */
885 static const int fib6_prop[RTN_MAX + 1] = {
892 [RTN_BLACKHOLE] = -EINVAL,
893 [RTN_UNREACHABLE] = -EHOSTUNREACH,
894 [RTN_PROHIBIT] = -EACCES,
895 [RTN_THROW] = -EAGAIN,
897 [RTN_XRESOLVE] = -EINVAL,
900 static int ip6_rt_type_to_error(u8 fib6_type)
902 return fib6_prop[fib6_type];
/* DST_* allocation flags a dst cloned from @rt should carry. */
905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
907 unsigned short flags = 0;
910 flags |= DST_NOCOUNT;
911 if (rt->dst_nopolicy)
912 flags |= DST_NOPOLICY;
/* Set dst error and input/output handlers for REJECT-type routes
 * according to the fib6 route type.
 */
919 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
921 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
923 switch (ort->fib6_type) {
925 rt->dst.output = dst_discard_out;
926 rt->dst.input = dst_discard;
929 rt->dst.output = ip6_pkt_prohibit_out;
930 rt->dst.input = ip6_pkt_prohibit;
933 case RTN_UNREACHABLE:
935 rt->dst.output = ip6_pkt_discard_out;
936 rt->dst.input = ip6_pkt_discard;
/* Initialize dst handlers of @rt from fib6 route @ort: reject routes
 * get error handlers, others get local/multicast/forward input paths
 * and lightweight-tunnel state when present.
 */
941 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
943 rt->dst.flags |= fib6_info_dst_flags(ort);
945 if (ort->fib6_flags & RTF_REJECT) {
946 ip6_rt_init_dst_reject(rt, ort);
951 rt->dst.output = ip6_output;
953 if (ort->fib6_type == RTN_LOCAL) {
954 rt->dst.input = ip6_input;
955 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
956 rt->dst.input = ip6_mc_input;
958 rt->dst.input = ip6_forward;
961 if (ort->fib6_nh.nh_lwtstate) {
962 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
963 lwtunnel_set_redirect(&rt->dst);
966 rt->dst.lastuse = jiffies;
/* Bind @rt to its originating fib6_info @from: take a reference,
 * publish the pointer, and share @from's metrics with the dst.
 */
969 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
971 rt->rt6i_flags &= ~RTF_EXPIRES;
972 fib6_info_hold(from);
973 rcu_assign_pointer(rt->from, from);
974 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
975 if (from->fib6_metrics != &dst_default_metrics) {
976 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
977 refcount_inc(&from->fib6_metrics->refcnt);
/* Copy all route state needed by a dst clone from fib6 entry @ort. */
981 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
983 struct net_device *dev = fib6_info_nh_dev(ort);
985 ip6_rt_init_dst(rt, ort);
987 rt->rt6i_dst = ort->fib6_dst;
988 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
989 rt->rt6i_gateway = ort->fib6_nh.nh_gw;
990 rt->rt6i_flags = ort->fib6_flags;
991 rt6_set_from(rt, ort);
992 #ifdef CONFIG_IPV6_SUBTREES
993 rt->rt6i_src = ort->fib6_src;
995 rt->rt6i_prefsrc = ort->fib6_prefsrc;
996 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
/* Walk back up the fib6 tree (including source-routing subtrees) to
 * find the next node that carries route info; stops at the tree root.
 */
999 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1000 struct in6_addr *saddr)
1002 struct fib6_node *pn, *sn;
1004 if (fn->fn_flags & RTN_TL_ROOT)
1006 pn = rcu_dereference(fn->parent);
1007 sn = FIB6_SUBTREE(pn);
1009 fn = fib6_lookup(sn, NULL, saddr);
1012 if (fn->fn_flags & RTN_RTINFO)
/* Try to take a reference on *prt; on failure optionally substitute
 * the netns null entry (when @null_fallback).
 */
1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1020 struct rt6_info *rt = *prt;
1022 if (dst_hold_safe(&rt->dst))
1024 if (null_fallback) {
1025 rt = net->ipv6.ip6_null_entry;
1034 /* called with rcu_lock held */
/* Allocate and initialize a dst clone of fib6 entry @rt. */
1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1037 unsigned short flags = fib6_info_dst_flags(rt);
1038 struct net_device *dev = rt->fib6_nh.nh_dev;
1039 struct rt6_info *nrt;
1041 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1043 ip6_rt_copy_init(nrt, rt);
/* Core flow-based lookup in one fib6 table: walk the tree, apply
 * device/multipath selection, backtrack on miss, then return either a
 * cached exception dst, the null entry, or a fresh dst clone.
 */
1048 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1049 struct fib6_table *table,
1051 const struct sk_buff *skb,
1054 struct fib6_info *f6i;
1055 struct fib6_node *fn;
1056 struct rt6_info *rt;
1058 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1059 flags &= ~RT6_LOOKUP_F_IFACE;
1062 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1064 f6i = rcu_dereference(fn->leaf);
1066 f6i = net->ipv6.fib6_null_entry;
1068 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1069 fl6->flowi6_oif, flags);
1070 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1071 f6i = rt6_multipath_select(net, f6i, fl6,
1072 fl6->flowi6_oif, skb, flags);
/* on miss, climb back up the tree and retry */
1074 if (f6i == net->ipv6.fib6_null_entry) {
1075 fn = fib6_backtrack(fn, &fl6->saddr);
1080 /* Search through exception table */
1081 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1083 if (ip6_hold_safe(net, &rt, true))
1084 dst_use_noref(&rt->dst, jiffies);
1085 } else if (f6i == net->ipv6.fib6_null_entry) {
1086 rt = net->ipv6.ip6_null_entry;
1089 rt = ip6_create_rt_rcu(f6i);
1091 rt = net->ipv6.ip6_null_entry;
1098 trace_fib6_table_lookup(net, rt, table, fl6);
/* Public flow lookup entry point: dispatch through fib rules. */
1103 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1104 const struct sk_buff *skb, int flags)
1106 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1108 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by (daddr, saddr, oif); returns NULL-on-error
 * style rt6_info rather than a dst with dst->error set.
 */
1110 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1111 const struct in6_addr *saddr, int oif,
1112 const struct sk_buff *skb, int strict)
1114 struct flowi6 fl6 = {
1118 struct dst_entry *dst;
1119 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1122 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1123 flags |= RT6_LOOKUP_F_HAS_SADDR;
1126 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1127 if (dst->error == 0)
1128 return (struct rt6_info *) dst;
1134 EXPORT_SYMBOL(rt6_lookup);
1136 /* ip6_ins_rt is called with FREE table->tb6_lock.
1137 * It takes new route entry, the addition fails by any reason the
1138 * route is released.
1139 * Caller must hold dst before calling it.
/* Insert @rt into its table under tb6_lock; returns fib6_add()'s
 * error code.
 */
1142 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1143 struct netlink_ext_ack *extack)
1146 struct fib6_table *table;
1148 table = rt->fib6_table;
1149 spin_lock_bh(&table->tb6_lock);
1150 err = fib6_add(&table->tb6_root, rt, info, extack);
1151 spin_unlock_bh(&table->tb6_lock);
/* Public insert wrapper with a minimal netlink info block. */
1156 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1158 struct nl_info info = { .nl_net = net, };
1160 return __ip6_ins_rt(rt, &info, NULL);
/* Build an RTF_CACHE clone of @ort keyed to the exact /128
 * (daddr[, saddr]) pair, for insertion into the exception table.
 */
1163 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1164 const struct in6_addr *daddr,
1165 const struct in6_addr *saddr)
1167 struct net_device *dev;
1168 struct rt6_info *rt;
1174 dev = ip6_rt_get_dev_rcu(ort);
1175 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1179 ip6_rt_copy_init(rt, ort);
1180 rt->rt6i_flags |= RTF_CACHE;
1181 rt->dst.flags |= DST_HOST;
1182 rt->rt6i_dst.addr = *daddr;
1183 rt->rt6i_dst.plen = 128;
1185 if (!rt6_is_gw_or_nonexthop(ort)) {
/* a host clone of a non-host on-link route is effectively anycast */
1186 if (ort->fib6_dst.plen != 128 &&
1187 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1188 rt->rt6i_flags |= RTF_ANYCAST;
1189 #ifdef CONFIG_IPV6_SUBTREES
1190 if (rt->rt6i_src.plen && saddr) {
1191 rt->rt6i_src.addr = *saddr;
1192 rt->rt6i_src.plen = 128;
/* Allocate an RTF_PCPU dst clone of fib6 entry @rt for the per-cpu
 * route cache.
 */
1200 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1202 unsigned short flags = fib6_info_dst_flags(rt);
1203 struct net_device *dev;
1204 struct rt6_info *pcpu_rt;
1207 dev = ip6_rt_get_dev_rcu(rt);
1208 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1212 ip6_rt_copy_init(pcpu_rt, rt);
1213 pcpu_rt->rt6i_flags |= RTF_PCPU;
1217 /* It should be called with rcu_read_lock() acquired */
/* Fetch this CPU's cached clone of @rt, taking a reference when one
 * exists.
 */
1218 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1220 struct rt6_info *pcpu_rt, **p;
1222 p = this_cpu_ptr(rt->rt6i_pcpu);
1226 ip6_hold_safe(NULL, &pcpu_rt, false);
/* Create and publish a per-cpu clone of @rt; cmpxchg() guards against
 * a concurrent writer installing one first.  Falls back to the null
 * entry when allocation fails.
 */
1231 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1232 struct fib6_info *rt)
1234 struct rt6_info *pcpu_rt, *prev, **p;
1236 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1238 dst_hold(&net->ipv6.ip6_null_entry->dst);
1239 return net->ipv6.ip6_null_entry;
1242 dst_hold(&pcpu_rt->dst);
1243 p = this_cpu_ptr(rt->rt6i_pcpu);
1244 prev = cmpxchg(p, NULL, pcpu_rt);
1250 /* exception hash table implementation
/* Single global lock serializing all writers of exception buckets. */
1252 static DEFINE_SPINLOCK(rt6_exception_lock);
1254 /* Remove rt6_ex from hash table and free the memory
1255 * Caller must hold rt6_exception_lock
1257 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1258 struct rt6_exception *rt6_ex)
1262 if (!bucket || !rt6_ex)
1265 net = dev_net(rt6_ex->rt6i->dst.dev);
1266 hlist_del_rcu(&rt6_ex->hlist);
1267 dst_release(&rt6_ex->rt6i->dst);
/* rcu-deferred free: readers may still be traversing the chain */
1268 kfree_rcu(rt6_ex, rcu);
1269 WARN_ON_ONCE(!bucket->depth);
1271 net->ipv6.rt6_stats->fib_rt_cache--;
1274 /* Remove oldest rt6_ex in bucket and free the memory
1275 * Caller must hold rt6_exception_lock
1277 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1279 struct rt6_exception *rt6_ex, *oldest = NULL;
1284 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1285 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1288 rt6_remove_exception(bucket, oldest);
/* Bucket index for a (dst[, src]) pair: jhash over the address(es)
 * with a boot-time random seed, folded to the bucket shift.
 */
1291 static u32 rt6_exception_hash(const struct in6_addr *dst,
1292 const struct in6_addr *src)
1294 static u32 seed __read_mostly;
1297 net_get_random_once(&seed, sizeof(seed));
1298 val = jhash(dst, sizeof(*dst), seed);
1300 #ifdef CONFIG_IPV6_SUBTREES
1302 val = jhash(src, sizeof(*src), val);
1304 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1307 /* Helper function to find the cached rt in the hash table
1308 * and update bucket pointer to point to the bucket for this
1309 * (daddr, saddr) pair
1310 * Caller must hold rt6_exception_lock
1312 static struct rt6_exception *
1313 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1314 const struct in6_addr *daddr,
1315 const struct in6_addr *saddr)
1317 struct rt6_exception *rt6_ex;
1320 if (!(*bucket) || !daddr)
1323 hval = rt6_exception_hash(daddr, saddr);
1326 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1327 struct rt6_info *rt6 = rt6_ex->rt6i;
1328 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1330 #ifdef CONFIG_IPV6_SUBTREES
/* with subtrees, the source address must also match */
1331 if (matched && saddr)
1332 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1340 /* Helper function to find the cached rt in the hash table
1341 * and update bucket pointer to point to the bucket for this
1342 * (daddr, saddr) pair
1343 * Caller must hold rcu_read_lock()
1345 static struct rt6_exception *
1346 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1347 const struct in6_addr *daddr,
1348 const struct in6_addr *saddr)
1350 struct rt6_exception *rt6_ex;
/* rcu variant of the spinlock helper above; read-side only */
1353 WARN_ON_ONCE(!rcu_read_lock_held());
1355 if (!(*bucket) || !daddr)
1358 hval = rt6_exception_hash(daddr, saddr);
1361 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1362 struct rt6_info *rt6 = rt6_ex->rt6i;
1363 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1365 #ifdef CONFIG_IPV6_SUBTREES
1366 if (matched && saddr)
1367 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
/* Effective MTU of @rt: route PMTU if set, else the nexthop device's
 * IPv6 MTU, clamped to IP6_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */
1375 static unsigned int fib6_mtu(const struct fib6_info *rt)
1379 if (rt->fib6_pmtu) {
1380 mtu = rt->fib6_pmtu;
1382 struct net_device *dev = fib6_info_nh_dev(rt);
1383 struct inet6_dev *idev;
1386 idev = __in6_dev_get(dev);
1387 mtu = idev->cnf.mtu6;
1391 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1393 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
/* Insert cached clone @nrt into @ort's exception hash table (creating
 * the bucket array on first use), replacing any existing entry for
 * the same key and evicting the oldest entry past FIB6_MAX_DEPTH.
 */
1396 static int rt6_insert_exception(struct rt6_info *nrt,
1397 struct fib6_info *ort)
1399 struct net *net = dev_net(nrt->dst.dev);
1400 struct rt6_exception_bucket *bucket;
1401 struct in6_addr *src_key = NULL;
1402 struct rt6_exception *rt6_ex;
1405 spin_lock_bh(&rt6_exception_lock);
/* table already torn down by rt6_flush_exceptions(); refuse */
1407 if (ort->exception_bucket_flushed) {
1412 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1413 lockdep_is_held(&rt6_exception_lock));
1415 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1421 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1424 #ifdef CONFIG_IPV6_SUBTREES
1425 /* rt6i_src.plen != 0 indicates ort is in subtree
1426 * and exception table is indexed by a hash of
1427 * both rt6i_dst and rt6i_src.
1428 * Otherwise, the exception table is indexed by
1429 * a hash of only rt6i_dst.
1431 if (ort->fib6_src.plen)
1432 src_key = &nrt->rt6i_src.addr;
1435 /* Update rt6i_prefsrc as it could be changed
1436 * in rt6_remove_prefsrc()
1438 nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1439 /* rt6_mtu_change() might lower mtu on ort.
1440 * Only insert this exception route if its mtu
1441 * is less than ort's mtu value.
1443 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1448 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1451 rt6_remove_exception(bucket, rt6_ex);
1453 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1459 rt6_ex->stamp = jiffies;
1460 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1462 net->ipv6.rt6_stats->fib_rt_cache++;
1464 if (bucket->depth > FIB6_MAX_DEPTH)
1465 rt6_exception_remove_oldest(bucket);
1468 spin_unlock_bh(&rt6_exception_lock);
1470 /* Update fn->fn_sernum to invalidate all cached dst */
1472 spin_lock_bh(&ort->fib6_table->tb6_lock);
1473 fib6_update_sernum(net, ort);
1474 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1475 fib6_force_start_gc(net);
/* Remove every cached exception route attached to fib6 entry @rt and
 * mark the entry so rt6_insert_exception() cannot recreate the bucket
 * list afterwards.  Called with rt6_exception_lock taken here.
 */
1481 void rt6_flush_exceptions(struct fib6_info *rt)
1483 struct rt6_exception_bucket *bucket;
1484 struct rt6_exception *rt6_ex;
1485 struct hlist_node *tmp;
1488 spin_lock_bh(&rt6_exception_lock);
1489 /* Prevent rt6_insert_exception() to recreate the bucket list */
1490 rt->exception_bucket_flushed = 1;
1492 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1493 lockdep_is_held(&rt6_exception_lock));
/* Walk every hash bucket and drop each chained exception. */
1497 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1498 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1499 rt6_remove_exception(bucket, rt6_ex);
/* After the flush each bucket must be empty. */
1500 WARN_ON_ONCE(bucket->depth);
1505 spin_unlock_bh(&rt6_exception_lock);
1508 /* Find cached rt in the hash table inside passed in rt
1509 * Caller has to hold rcu_read_lock()
1511 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1512 struct in6_addr *daddr,
1513 struct in6_addr *saddr)
1515 struct rt6_exception_bucket *bucket;
1516 struct in6_addr *src_key = NULL;
1517 struct rt6_exception *rt6_ex;
1518 struct rt6_info *res = NULL;
1520 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1522 #ifdef CONFIG_IPV6_SUBTREES
1523 /* rt6i_src.plen != 0 indicates rt is in subtree
1524 * and exception table is indexed by a hash of
1525 * both rt6i_dst and rt6i_src.
1526 * Otherwise, the exception table is indexed by
1527 * a hash of only rt6i_dst.
1529 if (rt->fib6_src.plen)
/* Lookup by destination (and source key, when in a subtree);
 * only a non-expired hit is returned (as @res, assigned in an
 * elided line of this extract).
 */
1532 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1534 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1540 /* Remove the passed in cached rt from the hash table that contains it */
1541 static int rt6_remove_exception_rt(struct rt6_info *rt)
1543 struct rt6_exception_bucket *bucket;
1544 struct in6_addr *src_key = NULL;
1545 struct rt6_exception *rt6_ex;
1546 struct fib6_info *from;
/* The parent fib6 entry is reached via rt->from; only RTF_CACHE
 * clones live in an exception table. */
1549 from = rcu_dereference_protected(rt->from,
1550 lockdep_is_held(&rt6_exception_lock));
1552 !(rt->rt6i_flags & RTF_CACHE))
1555 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1558 spin_lock_bh(&rt6_exception_lock);
1559 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1560 lockdep_is_held(&rt6_exception_lock));
1561 #ifdef CONFIG_IPV6_SUBTREES
1562 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1563 * and exception table is indexed by a hash of
1564 * both rt6i_dst and rt6i_src.
1565 * Otherwise, the exception table is indexed by
1566 * a hash of only rt6i_dst.
1568 if (from->fib6_src.plen)
1569 src_key = &rt->rt6i_src.addr;
1571 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1575 rt6_remove_exception(bucket, rt6_ex);
1581 spin_unlock_bh(&rt6_exception_lock);
1585 /* Find rt6_ex which contains the passed in rt cache and
/* refresh its last-use stamp so GC aging treats it as recently used.
 * Lookup is RCU-based; only applies to RTF_CACHE clones. */
1588 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1590 struct rt6_exception_bucket *bucket;
1591 struct fib6_info *from = rt->from;
1592 struct in6_addr *src_key = NULL;
1593 struct rt6_exception *rt6_ex;
1596 !(rt->rt6i_flags & RTF_CACHE))
1600 bucket = rcu_dereference(from->rt6i_exception_bucket);
1602 #ifdef CONFIG_IPV6_SUBTREES
1603 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1604 * and exception table is indexed by a hash of
1605 * both rt6i_dst and rt6i_src.
1606 * Otherwise, the exception table is indexed by
1607 * a hash of only rt6i_dst.
1609 if (from->fib6_src.plen)
1610 src_key = &rt->rt6i_src.addr;
1612 rt6_ex = __rt6_find_exception_rcu(&bucket,
1616 rt6_ex->stamp = jiffies;
/* Clear the preferred-source prefix length on every cached exception of
 * @rt; caller holds rt6_exception_lock (see lockdep assertion below).
 */
1621 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1623 struct rt6_exception_bucket *bucket;
1624 struct rt6_exception *rt6_ex;
1627 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1628 lockdep_is_held(&rt6_exception_lock));
1631 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1632 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
/* plen == 0 marks the prefsrc as unset. */
1633 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
/* Decide whether a device MTU change may be propagated to cached
 * route @rt; returns true for decreases, and for increases only when
 * the route's current PMTU equals the local device MTU.
 */
1640 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1641 struct rt6_info *rt, int mtu)
1643 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1644 * lowest MTU in the path: always allow updating the route PMTU to
1645 * reflect PMTU decreases.
1647 * If the new MTU is higher, and the route PMTU is equal to the local
1648 * MTU, this means the old MTU is the lowest in the path, so allow
1649 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1653 if (dst_mtu(&rt->dst) >= mtu)
1656 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
/* Propagate a device MTU change to every cached exception route of @rt
 * that is allowed to take it (see rt6_mtu_change_route_allowed()).
 * Caller holds rt6_exception_lock.
 */
1662 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1663 struct fib6_info *rt, int mtu)
1665 struct rt6_exception_bucket *bucket;
1666 struct rt6_exception *rt6_ex;
1669 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1670 lockdep_is_held(&rt6_exception_lock));
1675 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1676 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1677 struct rt6_info *entry = rt6_ex->rt6i;
1679 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1680 * route), the metrics of its rt->from have already
1683 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1684 rt6_mtu_change_route_allowed(idev, entry, mtu))
1685 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
/* Combined flag: a cached clone that also routes via a gateway. */
1691 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
/* Drop every cached gateway exception of @rt whose gateway matches
 * @gateway — used when a router becomes a plain host and its cached
 * redirect routes must be purged.
 */
1693 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1694 struct in6_addr *gateway)
1696 struct rt6_exception_bucket *bucket;
1697 struct rt6_exception *rt6_ex;
1698 struct hlist_node *tmp;
/* Fast path: nothing to do when no exception table exists. */
1701 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1704 spin_lock_bh(&rt6_exception_lock);
1705 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1706 lockdep_is_held(&rt6_exception_lock));
1709 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1710 hlist_for_each_entry_safe(rt6_ex, tmp,
1711 &bucket->chain, hlist) {
1712 struct rt6_info *entry = rt6_ex->rt6i;
1714 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1715 RTF_CACHE_GATEWAY &&
1716 ipv6_addr_equal(gateway,
1717 &entry->rt6i_gateway)) {
1718 rt6_remove_exception(bucket, rt6_ex);
1725 spin_unlock_bh(&rt6_exception_lock);
/* Examine one exception entry during GC and remove it when aged out,
 * expired (RTF_EXPIRES / RFC 8201), or pointing at a gateway whose
 * neighbour entry no longer has NTF_ROUTER set.
 */
1728 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1729 struct rt6_exception *rt6_ex,
1730 struct fib6_gc_args *gc_args,
1733 struct rt6_info *rt = rt6_ex->rt6i;
1735 /* we are pruning and obsoleting aged-out and non gateway exceptions
1736 * even if others have still references to them, so that on next
1737 * dst_check() such references can be dropped.
1738 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1739 * expired, independently from their aging, as per RFC 8201 section 4
1741 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1742 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1743 RT6_TRACE("aging clone %p\n", rt);
1744 rt6_remove_exception(bucket, rt6_ex);
1747 } else if (time_after(jiffies, rt->dst.expires)) {
1748 RT6_TRACE("purging expired route %p\n", rt);
1749 rt6_remove_exception(bucket, rt6_ex);
/* Gateway clones: drop them once the neighbour stops advertising
 * itself as a router. */
1753 if (rt->rt6i_flags & RTF_GATEWAY) {
1754 struct neighbour *neigh;
1755 __u8 neigh_flags = 0;
1757 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1759 neigh_flags = neigh->flags;
1761 if (!(neigh_flags & NTF_ROUTER)) {
1762 RT6_TRACE("purging route %p via non-router but gateway\n",
1764 rt6_remove_exception(bucket, rt6_ex);
/* GC entry point: walk all exception buckets of @rt and age each entry
 * via rt6_age_examine_exception().  Takes rt6_exception_lock (the
 * matching rcu_read_lock_bh() is in an elided line of this extract,
 * paired with the unlock at the end).
 */
1772 void rt6_age_exceptions(struct fib6_info *rt,
1773 struct fib6_gc_args *gc_args,
1776 struct rt6_exception_bucket *bucket;
1777 struct rt6_exception *rt6_ex;
1778 struct hlist_node *tmp;
1781 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1785 spin_lock(&rt6_exception_lock);
1786 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1787 lockdep_is_held(&rt6_exception_lock));
1790 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1791 hlist_for_each_entry_safe(rt6_ex, tmp,
1792 &bucket->chain, hlist) {
1793 rt6_age_examine_exception(bucket, rt6_ex,
1799 spin_unlock(&rt6_exception_lock);
1800 rcu_read_unlock_bh();
/* Core policy-routing lookup: select a fib6 entry for @fl6 in @table
 * (with multipath selection and backtracking on miss), then return an
 * rt6_info: a cached exception if one matches, an uncached RTF_CACHE
 * clone for the FLOWI_FLAG_KNOWN_NH non-gateway case, or a per-cpu
 * copy otherwise.
 * NOTE(review): rcu_read_lock()/unlock() and several branches are
 * elided in this extract.
 */
1803 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1804 int oif, struct flowi6 *fl6,
1805 const struct sk_buff *skb, int flags)
1807 struct fib6_node *fn, *saved_fn;
1808 struct fib6_info *f6i;
1809 struct rt6_info *rt;
1812 strict |= flags & RT6_LOOKUP_F_IFACE;
1813 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* When forwarding is globally off, prefer reachable routers. */
1814 if (net->ipv6.devconf_all->forwarding == 0)
1815 strict |= RT6_LOOKUP_F_REACHABLE;
1819 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1822 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1826 f6i = rt6_select(net, fn, oif, strict);
1827 if (f6i->fib6_nsiblings)
1828 f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
/* Miss: backtrack up the tree, then retry once without the
 * REACHABLE restriction before giving up. */
1829 if (f6i == net->ipv6.fib6_null_entry) {
1830 fn = fib6_backtrack(fn, &fl6->saddr);
1832 goto redo_rt6_select;
1833 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1834 /* also consider unreachable route */
1835 strict &= ~RT6_LOOKUP_F_REACHABLE;
1837 goto redo_rt6_select;
1841 if (f6i == net->ipv6.fib6_null_entry) {
1842 rt = net->ipv6.ip6_null_entry;
1845 trace_fib6_table_lookup(net, rt, table, fl6);
1849 /*Search through exception table */
1850 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1852 if (ip6_hold_safe(net, &rt, true))
1853 dst_use_noref(&rt->dst, jiffies);
1856 trace_fib6_table_lookup(net, rt, table, fl6);
1858 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1859 !(f6i->fib6_flags & RTF_GATEWAY))) {
1860 /* Create a RTF_CACHE clone which will not be
1861 * owned by the fib6 tree. It is for the special case where
1862 * the daddr in the skb during the neighbor look-up is different
1863 * from the fl6->daddr used to look-up route here.
1865 struct rt6_info *uncached_rt;
1867 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1872 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1873 * No need for another dst_hold()
1875 rt6_uncached_list_add(uncached_rt);
1876 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
/* Allocation failed: fall back to the null entry with a hold. */
1878 uncached_rt = net->ipv6.ip6_null_entry;
1879 dst_hold(&uncached_rt->dst);
1882 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1886 /* Get a percpu copy */
1888 struct rt6_info *pcpu_rt;
1891 pcpu_rt = rt6_get_pcpu_route(f6i);
1894 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1898 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1902 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Input-path wrapper: route using the incoming interface as oif. */
1904 static struct rt6_info *ip6_pol_route_input(struct net *net,
1905 struct fib6_table *table,
1907 const struct sk_buff *skb,
1910 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
/* Input route lookup through the policy-rule layer; link-local and
 * multicast destinations force strict interface matching (except on
 * PIM register devices).
 */
1913 struct dst_entry *ip6_route_input_lookup(struct net *net,
1914 struct net_device *dev,
1916 const struct sk_buff *skb,
1919 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1920 flags |= RT6_LOOKUP_F_IFACE;
1922 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1924 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Extract L3 flow keys for multipath hashing.  For ICMPv6 error
 * messages the keys are taken from the embedded (offending) inner
 * header so errors hash to the same path as the original flow; when
 * pre-dissected keys (@flkeys) are available they are used instead of
 * re-parsing the header.
 */
1926 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1927 struct flow_keys *keys,
1928 struct flow_keys *flkeys)
1930 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1931 const struct ipv6hdr *key_iph = outer_iph;
1932 struct flow_keys *_flkeys = flkeys;
1933 const struct ipv6hdr *inner_iph;
1934 const struct icmp6hdr *icmph;
1935 struct ipv6hdr _inner_iph;
1937 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1940 icmph = icmp6_hdr(skb);
/* Only ICMPv6 error types carry the offending packet. */
1941 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1942 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1943 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1944 icmph->icmp6_type != ICMPV6_PARAMPROB)
1947 inner_iph = skb_header_pointer(skb,
1948 skb_transport_offset(skb) + sizeof(*icmph),
1949 sizeof(_inner_iph), &_inner_iph);
1953 key_iph = inner_iph;
/* Prefer already-dissected keys when the caller supplied them. */
1957 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1958 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1959 keys->tags.flow_label = _flkeys->tags.flow_label;
1960 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1962 keys->addrs.v6addrs.src = key_iph->saddr;
1963 keys->addrs.v6addrs.dst = key_iph->daddr;
1964 keys->tags.flow_label = ip6_flowinfo(key_iph);
1965 keys->basic.ip_proto = key_iph->nexthdr;
1969 /* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash per the net's configured policy:
 * L3-only keys, or L4 (5-tuple) keys via flow dissection.  The result
 * is folded by flow_hash_from_keys() (return shifted by 1 in an
 * elided line — hash value 0 is reserved).
 */
1970 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1971 const struct sk_buff *skb, struct flow_keys *flkeys)
1973 struct flow_keys hash_keys;
1976 switch (ip6_multipath_hash_policy(net)) {
/* Policy 0: hash on L3 fields only. */
1978 memset(&hash_keys, 0, sizeof(hash_keys));
1979 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1981 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1983 hash_keys.addrs.v6addrs.src = fl6->saddr;
1984 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1985 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1986 hash_keys.basic.ip_proto = fl6->flowi6_proto;
/* Policy 1: hash on L4 (5-tuple) keys. */
1991 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1992 struct flow_keys keys;
1994 /* short-circuit if we already have L4 hash present */
1996 return skb_get_hash_raw(skb) >> 1;
1998 memset(&hash_keys, 0, sizeof(hash_keys));
2001 skb_flow_dissect_flow_keys(skb, &keys, flag);
2004 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2005 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2006 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2007 hash_keys.ports.src = flkeys->ports.src;
2008 hash_keys.ports.dst = flkeys->ports.dst;
2009 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
/* No skb: build the 5-tuple from the flow descriptor. */
2011 memset(&hash_keys, 0, sizeof(hash_keys));
2012 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2013 hash_keys.addrs.v6addrs.src = fl6->saddr;
2014 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2015 hash_keys.ports.src = fl6->fl6_sport;
2016 hash_keys.ports.dst = fl6->fl6_dport;
2017 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2021 mhash = flow_hash_from_keys(&hash_keys);
/* Receive-path entry point: build a flowi6 from the packet headers
 * (honouring tunnel metadata), compute the multipath hash for ICMPv6,
 * and attach the looked-up dst to the skb.
 */
2026 void ip6_route_input(struct sk_buff *skb)
2028 const struct ipv6hdr *iph = ipv6_hdr(skb);
2029 struct net *net = dev_net(skb->dev);
2030 int flags = RT6_LOOKUP_F_HAS_SADDR;
2031 struct ip_tunnel_info *tun_info;
2032 struct flowi6 fl6 = {
2033 .flowi6_iif = skb->dev->ifindex,
2034 .daddr = iph->daddr,
2035 .saddr = iph->saddr,
2036 .flowlabel = ip6_flowinfo(iph),
2037 .flowi6_mark = skb->mark,
2038 .flowi6_proto = iph->nexthdr,
2040 struct flow_keys *flkeys = NULL, _flkeys;
/* Collected (RX-direction) tunnel metadata keys the route lookup. */
2042 tun_info = skb_tunnel_info(skb);
2043 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2044 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2046 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
/* ICMPv6 must hash on the embedded flow so errors follow the
 * same multipath leg as the original traffic. */
2049 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2050 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2053 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
/* Output-path wrapper: route using the outgoing interface as oif. */
2056 static struct rt6_info *ip6_pol_route_output(struct net *net,
2057 struct fib6_table *table,
2059 const struct sk_buff *skb,
2062 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
/* Output route lookup: handle strict (link-local/multicast) scope via
 * l3mdev first, then set strictness/saddr flags from the socket and
 * flow before dispatching through the policy-rule layer.
 */
2065 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2066 struct flowi6 *fl6, int flags)
2070 if (rt6_need_strict(&fl6->daddr)) {
2071 struct dst_entry *dst;
/* Give an L3 master device first shot at link-scope lookups. */
2073 dst = l3mdev_link_scope_lookup(net, fl6);
2078 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2080 any_src = ipv6_addr_any(&fl6->saddr);
2081 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2082 (fl6->flowi6_oif && any_src))
2083 flags |= RT6_LOOKUP_F_IFACE;
2086 flags |= RT6_LOOKUP_F_HAS_SADDR;
2088 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2090 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2092 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Clone @dst_orig into a blackhole dst (discarding input/output ops)
 * bound to the loopback device; used e.g. by xfrm when a route must be
 * kept but traffic dropped.  Releases @dst_orig and returns the clone,
 * or ERR_PTR(-ENOMEM) when allocation failed.
 */
2094 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2096 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2097 struct net_device *loopback_dev = net->loopback_dev;
2098 struct dst_entry *new = NULL;
2100 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2101 DST_OBSOLETE_DEAD, 0);
2104 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
/* All packets through a blackhole route are silently discarded. */
2108 new->input = dst_discard;
2109 new->output = dst_discard_out;
2111 dst_copy_metrics(new, &ort->dst);
2113 rt->rt6i_idev = in6_dev_get(loopback_dev);
2114 rt->rt6i_gateway = ort->rt6i_gateway;
/* RTF_PCPU is dropped: the clone is not a per-cpu copy. */
2115 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2117 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2118 #ifdef CONFIG_IPV6_SUBTREES
2119 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2123 dst_release(dst_orig);
2124 return new ? new : ERR_PTR(-ENOMEM);
2128 * Destination cache support functions
/* Validate a fib6 entry against a cached cookie: the entry must yield
 * the same sernum cookie and must not be expired.
 */
2131 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2135 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2138 if (fib6_check_expired(f6i))
/* dst_check helper for tree-owned routes: valid only while the parent
 * fib6 entry's cookie matches and the cached route has not expired.
 */
2144 static struct dst_entry *rt6_check(struct rt6_info *rt,
2145 struct fib6_info *from,
2150 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2151 rt_cookie != cookie)
2154 if (rt6_check_expired(rt))
/* dst_check helper for per-cpu/uncached clones: valid while the clone
 * itself is unexpired, still marked FORCE_CHK, and its parent fib6
 * entry passes fib6_check() for @cookie.
 */
2160 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2161 struct fib6_info *from,
2164 if (!__rt6_check_expired(rt) &&
2165 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2166 fib6_check(from, cookie))
/* dst_ops->check implementation: dispatch to rt6_dst_from_check() for
 * per-cpu/uncached clones and rt6_check() for tree-owned routes.
 * Returns the dst if still valid, NULL otherwise (return in elided
 * lines of this extract).
 */
2172 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2174 struct dst_entry *dst_ret;
2175 struct fib6_info *from;
2176 struct rt6_info *rt;
2178 rt = container_of(dst, struct rt6_info, dst);
2182 /* All IPV6 dsts are created with ->obsolete set to the value
2183 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2184 * into this function always.
2187 from = rcu_dereference(rt->from);
2189 if (from && (rt->rt6i_flags & RTF_PCPU ||
2190 unlikely(!list_empty(&rt->rt6i_uncached))))
2191 dst_ret = rt6_dst_from_check(rt, from, cookie);
2193 dst_ret = rt6_check(rt, from, cookie);
/* dst_ops->negative_advice: for cached clones, drop the exception
 * entry once expired.  The non-cache path and return value are in
 * elided lines of this extract.
 */
2200 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2202 struct rt6_info *rt = (struct rt6_info *) dst;
2205 if (rt->rt6i_flags & RTF_CACHE) {
2207 if (rt6_check_expired(rt)) {
2208 rt6_remove_exception_rt(rt);
/* dst_ops->link_failure: report unreachable via ICMPv6 and invalidate
 * the route — cached clones are removed from the exception table,
 * default routes get their tree node's sernum bumped (the action on
 * @fn is in an elided line of this extract).
 */
2220 static void ip6_link_failure(struct sk_buff *skb)
2222 struct rt6_info *rt;
2224 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2226 rt = (struct rt6_info *) skb_dst(skb);
2229 if (rt->rt6i_flags & RTF_CACHE) {
2230 if (dst_hold_safe(&rt->dst))
2231 rt6_remove_exception_rt(rt);
2233 struct fib6_info *from;
2234 struct fib6_node *fn;
2236 from = rcu_dereference(rt->from);
2238 fn = rcu_dereference(from->fib6_node);
2239 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Set an expiry on cached route @rt0.  If it did not expire before,
 * inherit the parent fib6 entry's expiry first, then arm the dst timer
 * for @timeout and mark the route RTF_EXPIRES.
 */
2247 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2249 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2250 struct fib6_info *from;
2253 from = rcu_dereference(rt0->from);
2255 rt0->dst.expires = from->expires;
2259 dst_set_expires(&rt0->dst, timeout);
2260 rt0->rt6i_flags |= RTF_EXPIRES;
/* Apply a learned PMTU to @rt: set the MTU metric, flag the route as
 * modified, and arm the sysctl-controlled PMTU expiry timer.
 */
2263 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2265 struct net *net = dev_net(rt->dst.dev);
2267 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2268 rt->rt6i_flags |= RTF_MODIFIED;
2269 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* Return true when a PMTU update on @rt should be stored in a new
 * RTF_CACHE exception rather than applied in place: the route is not
 * itself a cache entry, and is either per-cpu or still linked to a
 * parent fib6 entry.
 */
2272 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2277 from_set = !!rcu_dereference(rt->from);
2280 return !(rt->rt6i_flags & RTF_CACHE) &&
2281 (rt->rt6i_flags & RTF_PCPU || from_set);
/* Core PMTU update: confirm the neighbour, clamp @mtu to IPV6_MIN_MTU,
 * and either update @dst in place or allocate an RTF_CACHE exception
 * clone carrying the new PMTU (rt6_cache_allowed_for_pmtu() decides).
 * Destination/source come from @iph when given, else from @sk.
 */
2284 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2285 const struct ipv6hdr *iph, u32 mtu)
2287 const struct in6_addr *daddr, *saddr;
2288 struct rt6_info *rt6 = (struct rt6_info *)dst;
/* Local routes and routes with a locked MTU metric never change. */
2290 if (rt6->rt6i_flags & RTF_LOCAL)
2293 if (dst_metric_locked(dst, RTAX_MTU))
2297 daddr = &iph->daddr;
2298 saddr = &iph->saddr;
2300 daddr = &sk->sk_v6_daddr;
2301 saddr = &inet6_sk(sk)->saddr;
2306 dst_confirm_neigh(dst, daddr);
2307 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
/* Only decreases matter for PMTU discovery. */
2308 if (mtu >= dst_mtu(dst))
2311 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2312 rt6_do_update_pmtu(rt6, mtu);
2313 /* update rt6_ex->stamp for cache */
2314 if (rt6->rt6i_flags & RTF_CACHE)
2315 rt6_update_exception_stamp_rt(rt6);
2317 struct fib6_info *from;
2318 struct rt6_info *nrt6;
2321 from = rcu_dereference(rt6->from);
2322 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2324 rt6_do_update_pmtu(nrt6, mtu);
/* Insertion failure means the clone is not tree-owned: free it. */
2325 if (rt6_insert_exception(nrt6, from))
2326 dst_release_immediate(&nrt6->dst);
/* dst_ops->update_pmtu hook: forward to __ip6_rt_update_pmtu() with
 * the skb's IPv6 header when an skb is available.
 */
2332 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2333 struct sk_buff *skb, u32 mtu)
2335 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Public PMTU update: rebuild the flow from the offending packet's
 * header (@skb->data points at the embedded IPv6 header), look up the
 * route, and apply @mtu (network byte order) to it.
 */
2338 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2339 int oif, u32 mark, kuid_t uid)
2341 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2342 struct dst_entry *dst;
2345 memset(&fl6, 0, sizeof(fl6));
2346 fl6.flowi6_oif = oif;
/* Fall back to the reply-mark policy when no explicit mark given. */
2347 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2348 fl6.daddr = iph->daddr;
2349 fl6.saddr = iph->saddr;
2350 fl6.flowlabel = ip6_flowinfo(iph);
2351 fl6.flowi6_uid = uid;
2353 dst = ip6_route_output(net, NULL, &fl6);
2355 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2358 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-flavoured PMTU update: apply @mtu using the socket's bound
 * device, mark and uid, then refresh the socket's cached dst when the
 * old one no longer validates.
 */
2360 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2362 struct dst_entry *dst;
2364 ip6_update_pmtu(skb, sock_net(sk), mtu,
2365 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2367 dst = __sk_dst_get(sk);
2368 if (!dst || !dst->obsolete ||
2369 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
/* Avoid re-routing under the socket owner; v4-mapped sockets are
 * handled by the IPv4 path. */
2373 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2374 ip6_datagram_dst_update(sk, false);
2377 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* Store @dst on socket @sk, recording the daddr (and, with subtrees,
 * the saddr) only when the flow's addresses match the socket's, so a
 * later lookup can tell whether the cached route still applies.
 */
2379 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2380 const struct flowi6 *fl6)
2382 #ifdef CONFIG_IPV6_SUBTREES
2383 struct ipv6_pinfo *np = inet6_sk(sk);
2386 ip6_dst_store(sk, dst,
2387 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2388 &sk->sk_v6_daddr : NULL,
2389 #ifdef CONFIG_IPV6_SUBTREES
2390 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2396 /* Handle redirects */
/* Flow descriptor extended with the redirecting router's address,
 * passed through fib6_rule_lookup() to __ip6_route_redirect(). */
2397 struct ip6rd_flowi {
2399 struct in6_addr gateway;
/* Redirect-specific route lookup: find the fib6 entry (or cached
 * exception) whose next hop matches the redirecting gateway, per the
 * RFC 4861 rule that redirects are only valid from the current next
 * hop.  NOTE(review): loop braces, backtrack condition and the final
 * return are elided in this extract.
 */
2402 static struct rt6_info *__ip6_route_redirect(struct net *net,
2403 struct fib6_table *table,
2405 const struct sk_buff *skb,
2408 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2409 struct rt6_info *ret = NULL, *rt_cache;
2410 struct fib6_info *rt;
2411 struct fib6_node *fn;
2413 /* Get the "current" route for this destination and
2414 * check if the redirect has come from appropriate router.
2416 * RFC 4861 specifies that redirects should only be
2417 * accepted if they come from the nexthop to the target.
2418 * Due to the way the routes are chosen, this notion
2419 * is a bit fuzzy and one might need to check all possible
2424 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2426 for_each_fib6_node_rt_rcu(fn) {
2427 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2429 if (fib6_check_expired(rt))
2431 if (rt->fib6_flags & RTF_REJECT)
2433 if (!(rt->fib6_flags & RTF_GATEWAY))
2435 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2437 /* rt_cache's gateway might be different from its 'parent'
2438 * in the case of an ip redirect.
2439 * So we keep searching in the exception table if the gateway
2442 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2443 rt_cache = rt6_find_cached_rt(rt,
2447 ipv6_addr_equal(&rdfl->gateway,
2448 &rt_cache->rt6i_gateway)) {
2458 rt = net->ipv6.fib6_null_entry;
2459 else if (rt->fib6_flags & RTF_REJECT) {
2460 ret = net->ipv6.ip6_null_entry;
2464 if (rt == net->ipv6.fib6_null_entry) {
2465 fn = fib6_backtrack(fn, &fl6->saddr);
/* Hold the result (or materialize an rt6_info for a fib6 entry). */
2472 dst_hold(&ret->dst);
2474 ret = ip6_create_rt_rcu(rt);
2478 trace_fib6_table_lookup(net, ret, table, fl6);
/* Wrap @fl6 and the redirecting @gateway into an ip6rd_flowi and run
 * the redirect lookup through the policy-rule layer.
 */
2482 static struct dst_entry *ip6_route_redirect(struct net *net,
2483 const struct flowi6 *fl6,
2484 const struct sk_buff *skb,
2485 const struct in6_addr *gateway)
2487 int flags = RT6_LOOKUP_F_HAS_SADDR;
2488 struct ip6rd_flowi rdfl;
2491 rdfl.gateway = *gateway;
2493 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2494 flags, __ip6_route_redirect);
/* Process an ICMPv6 redirect for the packet embedded at @skb->data:
 * rebuild the flow, look up the redirect route, and apply it via
 * rt6_do_redirect().  The redirecting gateway is the outer packet's
 * source address.
 */
2497 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2500 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2501 struct dst_entry *dst;
2504 memset(&fl6, 0, sizeof(fl6));
2505 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2506 fl6.flowi6_oif = oif;
2507 fl6.flowi6_mark = mark;
2508 fl6.daddr = iph->daddr;
2509 fl6.saddr = iph->saddr;
2510 fl6.flowlabel = ip6_flowinfo(iph);
2511 fl6.flowi6_uid = uid;
2513 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2514 rt6_do_redirect(dst, NULL, skb);
2517 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Variant of ip6_redirect() for redirect messages that carry no
 * embedded packet header: take the target from the rd_msg and route
 * toward the ND message's own addresses.
 */
2519 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2522 const struct ipv6hdr *iph = ipv6_hdr(skb);
2523 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2524 struct dst_entry *dst;
2527 memset(&fl6, 0, sizeof(fl6));
2528 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2529 fl6.flowi6_oif = oif;
2530 fl6.flowi6_mark = mark;
2531 fl6.daddr = msg->dest;
2532 fl6.saddr = iph->daddr;
2533 fl6.flowi6_uid = sock_net_uid(net, NULL);
2535 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2536 rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper around ip6_redirect() using the socket's
 * bound device, mark and (in an elided argument) uid.
 */
2540 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2542 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2545 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss: derive the advertised MSS from the route
 * MTU minus IPv6+TCP headers, clamped below by the ip6_rt_min_advmss
 * sysctl and above by the non-jumbo maximum (IPV6_MAXPLEN semantics).
 */
2547 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2549 struct net_device *dev = dst->dev;
2550 unsigned int mtu = dst_mtu(dst);
2551 struct net *net = dev_net(dev);
2553 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2555 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2556 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2559 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2560 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2561 * IPV6_MAXPLEN is also valid and means: "any MSS,
2562 * rely only on pmtu discovery"
2564 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu: prefer the route's explicit MTU metric, fall back to
 * the device's IPv6 MTU, cap at IP6_MAX_MTU, and subtract any
 * lightweight-tunnel encapsulation headroom.
 */
2569 static unsigned int ip6_mtu(const struct dst_entry *dst)
2571 struct inet6_dev *idev;
2574 mtu = dst_metric_raw(dst, RTAX_MTU);
2581 idev = __in6_dev_get(dst->dev);
2583 mtu = idev->cnf.mtu6;
2587 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2589 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Allocate a standalone dst for sending an ICMPv6 message toward the
 * flow's destination.  The dst is placed on the uncached list (so
 * rt6_disable_ip() can release the device) and passed through xfrm
 * lookup before being returned; errors come back as ERR_PTR().
 */
2592 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2595 struct dst_entry *dst;
2596 struct rt6_info *rt;
2597 struct inet6_dev *idev = in6_dev_get(dev);
2598 struct net *net = dev_net(dev);
2600 if (unlikely(!idev))
2601 return ERR_PTR(-ENODEV);
2603 rt = ip6_dst_alloc(net, dev, 0);
2604 if (unlikely(!rt)) {
2606 dst = ERR_PTR(-ENOMEM);
/* Host route to the flow's destination; hop limit 0 metric means
 * "use the per-device default". */
2610 rt->dst.flags |= DST_HOST;
2611 rt->dst.input = ip6_input;
2612 rt->dst.output = ip6_output;
2613 rt->rt6i_gateway = fl6->daddr;
2614 rt->rt6i_dst.addr = fl6->daddr;
2615 rt->rt6i_dst.plen = 128;
2616 rt->rt6i_idev = idev;
2617 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2619 /* Add this dst into uncached_list so that rt6_disable_ip() can
2620 * do proper release of the net_device
2622 rt6_uncached_list_add(rt);
2623 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2625 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc: run fib6 garbage collection when the entry count
 * exceeds ip6_rt_max_size or the minimum GC interval has elapsed.
 * The adaptive 'expire' value shrinks when GC gets the table under
 * the threshold and decays geometrically otherwise.  Returns nonzero
 * when the table is still over the size limit.
 */
2631 static int ip6_dst_gc(struct dst_ops *ops)
2633 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2634 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2635 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2636 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2637 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2638 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2641 entries = dst_entries_get_fast(ops);
/* Rate-limit: skip GC while under both the time and size limits. */
2642 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2643 entries <= rt_max_size)
2646 net->ipv6.ip6_rt_gc_expire++;
2647 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2648 entries = dst_entries_get_slow(ops);
2649 if (entries < ops->gc_thresh)
2650 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2652 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2653 return entries > rt_max_size;
/* Allocate a dst_metrics block for @rt and fill it from the netlink
 * metrics attribute in @cfg; returns the ip_metrics_convert() result
 * (error paths for the attribute-absent and OOM cases are in elided
 * lines of this extract).
 */
2656 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2657 struct fib6_config *cfg)
2659 struct dst_metrics *p;
2664 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2668 refcount_set(&p->refcnt, 1);
2669 rt->fib6_metrics = p;
2671 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
/* Resolve a nexthop gateway in a specific fib table @tbid (used when
 * validating a route's gateway).  Ignores link state and falls back to
 * a full lookup when the table lookup yields the null entry (the
 * fallback body is in elided lines of this extract).
 */
2674 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2675 struct fib6_config *cfg,
2676 const struct in6_addr *gw_addr,
2677 u32 tbid, int flags)
2679 struct flowi6 fl6 = {
2680 .flowi6_oif = cfg->fc_ifindex,
2682 .saddr = cfg->fc_prefsrc,
2684 struct fib6_table *table;
2685 struct rt6_info *rt;
2687 table = fib6_get_table(net, tbid);
2691 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2692 flags |= RT6_LOOKUP_F_HAS_SADDR;
2694 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2695 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2697 /* if table lookup failed, fall back to full lookup */
2698 if (rt == net->ipv6.ip6_null_entry) {
/* Validate an RTNH_F_ONLINK nexthop: the gateway must resolve (in the
 * device's fib table, or main) to a route on the same device that is
 * not local/anycast/reject; otherwise reject via extack.
 */
2706 static int ip6_route_check_nh_onlink(struct net *net,
2707 struct fib6_config *cfg,
2708 const struct net_device *dev,
2709 struct netlink_ext_ack *extack)
2711 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2712 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2713 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2714 struct rt6_info *grt;
2718 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2720 if (!grt->dst.error &&
2721 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2722 NL_SET_ERR_MSG(extack,
2723 "Nexthop has invalid gateway or device mismatch");
/* Resolve and validate an ordinary (non-onlink) nexthop gateway,
 * filling in *@_dev and *@idev from the route that reaches it.  The
 * gateway's route must not itself point through another gateway, and
 * must use the caller's device when one was specified.  Returns 0 on
 * success, -EHOSTUNREACH otherwise (error paths partly elided in this
 * extract).
 */
2733 static int ip6_route_check_nh(struct net *net,
2734 struct fib6_config *cfg,
2735 struct net_device **_dev,
2736 struct inet6_dev **idev)
2738 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2739 struct net_device *dev = _dev ? *_dev : NULL;
2740 struct rt6_info *grt = NULL;
2741 int err = -EHOSTUNREACH;
/* Prefer a scoped lookup when the config names a table. */
2743 if (cfg->fc_table) {
2744 int flags = RT6_LOOKUP_F_IFACE;
2746 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2747 cfg->fc_table, flags);
2749 if (grt->rt6i_flags & RTF_GATEWAY ||
2750 (dev && dev != grt->dst.dev)) {
2758 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2764 if (dev != grt->dst.dev) {
/* Adopt the device/idev the gateway route resolved to. */
2769 *_dev = dev = grt->dst.dev;
2770 *idev = grt->rt6i_idev;
2772 in6_dev_hold(grt->rt6i_idev);
2775 if (!(grt->rt6i_flags & RTF_GATEWAY))
/* Full gateway validation for a new route: reject local gateways,
 * require link-local (or unicast/v4-mapped, per the ANK exception)
 * addresses, resolve the nexthop (onlink or ordinary), and verify the
 * resulting egress device exists and is not loopback.  Errors are
 * reported through @extack.
 */
2784 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2785 struct net_device **_dev, struct inet6_dev **idev,
2786 struct netlink_ext_ack *extack)
2788 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2789 int gwa_type = ipv6_addr_type(gw_addr);
2790 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2791 const struct net_device *dev = *_dev;
2792 bool need_addr_check = !dev;
2795 /* if gw_addr is local we will fail to detect this in case
2796 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2797 * will return already-added prefix route via interface that
2798 * prefix route was assigned to, which might be non-loopback.
2801 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2802 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2806 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2807 /* IPv6 strictly inhibits using not link-local
2808 * addresses as nexthop address.
2809 * Otherwise, router will not able to send redirects.
2810 * It is very good, but in some (rare!) circumstances
2811 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2812 * some exceptions. --ANK
2813 * We allow IPv4-mapped nexthops to support RFC4798-type
2816 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2817 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2821 if (cfg->fc_flags & RTNH_F_ONLINK)
2822 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2824 err = ip6_route_check_nh(net, cfg, _dev, idev);
2830 /* reload in case device was changed */
2835 NL_SET_ERR_MSG(extack, "Egress device not specified");
2837 } else if (dev->flags & IFF_LOOPBACK) {
2838 NL_SET_ERR_MSG(extack,
2839 "Egress device can not be loopback device for this route");
2843 /* if we did not check gw_addr above, do so now that the
2844 * egress device has been resolved.
2846 if (need_addr_check &&
2847 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2848 NL_SET_ERR_MSG(extack, "Gateway can not be a local address")
/*
 * ip6_route_info_create - build a fib6_info from a netlink/ioctl supplied
 * fib6_config.  Validates flags and lengths, resolves device and table,
 * fills in metrics, expiry, nexthop, gateway and preferred source, and
 * returns the new entry or ERR_PTR(err) on failure.
 * NOTE(review): elided listing — error-unwind labels and several returns
 * are not shown.
 */
2857 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2859 struct netlink_ext_ack *extack)
2861 struct net *net = cfg->fc_nlinfo.nl_net;
2862 struct fib6_info *rt = NULL;
2863 struct net_device *dev = NULL;
2864 struct inet6_dev *idev = NULL;
2865 struct fib6_table *table;
2869 /* RTF_PCPU is an internal flag; can not be set by userspace */
2870 if (cfg->fc_flags & RTF_PCPU) {
2871 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2875 /* RTF_CACHE is an internal flag; can not be set by userspace */
2876 if (cfg->fc_flags & RTF_CACHE) {
2877 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2881 if (cfg->fc_type > RTN_MAX) {
2882 NL_SET_ERR_MSG(extack, "Invalid route type");
/* IPv6 prefix lengths are at most 128 bits */
2886 if (cfg->fc_dst_len > 128) {
2887 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2890 if (cfg->fc_src_len > 128) {
2891 NL_SET_ERR_MSG(extack, "Invalid source address length");
2894 #ifndef CONFIG_IPV6_SUBTREES
2895 if (cfg->fc_src_len) {
2896 NL_SET_ERR_MSG(extack,
2897 "Specifying source address requires IPV6_SUBTREES to be enabled")&#59;
2901 if (cfg->fc_ifindex) {
2903 dev = dev_get_by_index(net, cfg->fc_ifindex);
2906 idev = in6_dev_get(dev);
/* default userspace priority when none was given */
2911 if (cfg->fc_metric == 0)
2912 cfg->fc_metric = IP6_RT_PRIO_USER;
2914 if (cfg->fc_flags & RTNH_F_ONLINK) {
2916 NL_SET_ERR_MSG(extack,
2917 "Nexthop device required for onlink");
2922 if (!(dev->flags & IFF_UP)) {
2923 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
/* without NLM_F_CREATE only look up an existing table; warn (but still
 * create) if the table does not exist yet */
2930 if (cfg->fc_nlinfo.nlh &&
2931 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2932 table = fib6_get_table(net, cfg->fc_table);
2934 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2935 table = fib6_new_table(net, cfg->fc_table);
2938 table = fib6_new_table(net, cfg->fc_table);
2945 rt = fib6_info_alloc(gfp_flags);
/* addrconf-originated routes are not accounted against the dst limit */
2949 if (cfg->fc_flags & RTF_ADDRCONF)
2950 rt->dst_nocount = true;
2952 err = ip6_convert_metrics(net, rt, cfg);
2956 if (cfg->fc_flags & RTF_EXPIRES)
2957 fib6_set_expires(rt, jiffies +
2958 clock_t_to_jiffies(cfg->fc_expires));
2960 fib6_clean_expires(rt);
2962 if (cfg->fc_protocol == RTPROT_UNSPEC)
2963 cfg->fc_protocol = RTPROT_BOOT;
2964 rt->fib6_protocol = cfg->fc_protocol;
2966 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* lightweight-tunnel encapsulation state, if requested */
2968 if (cfg->fc_encap) {
2969 struct lwtunnel_state *lwtstate;
2971 err = lwtunnel_build_state(cfg->fc_encap_type,
2972 cfg->fc_encap, AF_INET6, cfg,
2976 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2979 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2980 rt->fib6_dst.plen = cfg->fc_dst_len;
2981 if (rt->fib6_dst.plen == 128)
2982 rt->dst_host = true;
2984 #ifdef CONFIG_IPV6_SUBTREES
2985 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
2986 rt->fib6_src.plen = cfg->fc_src_len;
2989 rt->fib6_metric = cfg->fc_metric;
2990 rt->fib6_nh.nh_weight = 1;
2992 rt->fib6_type = cfg->fc_type;
2994 /* We cannot add true routes via loopback here,
2995 they would result in kernel looping; promote them to reject routes
2997 if ((cfg->fc_flags & RTF_REJECT) ||
2998 (dev && (dev->flags & IFF_LOOPBACK) &&
2999 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3000 !(cfg->fc_flags & RTF_LOCAL))) {
3001 /* hold loopback dev/idev if we haven't done so. */
3002 if (dev != net->loopback_dev) {
3007 dev = net->loopback_dev;
3009 idev = in6_dev_get(dev);
3015 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3019 if (cfg->fc_flags & RTF_GATEWAY) {
3020 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3024 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3031 if (idev->cnf.disable_ipv6) {
3032 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3037 if (!(dev->flags & IFF_UP)) {
3038 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
/* preferred source, when given, must be a local address on the device */
3043 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3044 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3045 NL_SET_ERR_MSG(extack, "Invalid source address");
3049 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3050 rt->fib6_prefsrc.plen = 128;
3052 rt->fib6_prefsrc.plen = 0;
3054 rt->fib6_flags = cfg->fc_flags;
/* mark nexthop linkdown if carrier is off (except LOCAL/ANYCAST) */
3057 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3058 !netif_carrier_ok(dev))
3059 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3060 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3061 rt->fib6_nh.nh_dev = dev;
3062 rt->fib6_table = table;
3064 cfg->fc_nlinfo.nl_net = dev_net(dev);
/* error path: drop the partially-built entry */
3076 fib6_info_release(rt);
3077 return ERR_PTR(err);
/*
 * ip6_route_add - create a fib6_info from @cfg and insert it into the
 * FIB.  The local reference taken by ip6_route_info_create() is dropped
 * after insertion regardless of success.
 */
3080 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3081 struct netlink_ext_ack *extack)
3083 struct fib6_info *rt;
3086 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3090 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3091 fib6_info_release(rt);
/*
 * __ip6_del_rt - remove a single fib6_info from its table under the
 * table lock, then drop the caller's reference.  Deleting the null
 * entry is rejected (elided branch).
 */
3096 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3098 struct net *net = info->nl_net;
3099 struct fib6_table *table;
3102 if (rt == net->ipv6.fib6_null_entry) {
3107 table = rt->fib6_table;
3108 spin_lock_bh(&table->tb6_lock);
3109 err = fib6_del(rt, info);
3110 spin_unlock_bh(&table->tb6_lock);
3113 fib6_info_release(rt);
/* ip6_del_rt - convenience wrapper: delete @rt with a minimal nl_info
 * carrying only the network namespace. */
3117 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3119 struct nl_info info = { .nl_net = net };
3121 return __ip6_del_rt(rt, &info);
/*
 * __ip6_del_rt_siblings - delete a multipath route and all its sibling
 * nexthops under one table lock, emitting a single RTM_DELROUTE
 * notification covering every hop (per-hop notifications are suppressed
 * via info->skip_notify).
 * NOTE(review): elided listing — loop-body braces and some error paths
 * are not shown.
 */
3124 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3126 struct nl_info *info = &cfg->fc_nlinfo;
3127 struct net *net = info->nl_net;
3128 struct sk_buff *skb = NULL;
3129 struct fib6_table *table;
3132 if (rt == net->ipv6.fib6_null_entry)
3134 table = rt->fib6_table;
3135 spin_lock_bh(&table->tb6_lock);
3137 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3138 struct fib6_info *sibling, *next_sibling;
3140 /* prefer to send a single notification with all hops */
3141 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3143 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3145 if (rt6_fill_node(net, skb, rt, NULL,
3146 NULL, NULL, 0, RTM_DELROUTE,
3147 info->portid, seq, 0) < 0) {
3151 info->skip_notify = 1;
3154 list_for_each_entry_safe(sibling, next_sibling,
3157 err = fib6_del(sibling, info);
3163 err = fib6_del(rt, info);
3165 spin_unlock_bh(&table->tb6_lock);
3167 fib6_info_release(rt);
/* send the combined notification built above, if any */
3170 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3171 info->nlh, gfp_any());
/*
 * ip6_del_cached_rt - remove a cached (exception) route if it matches
 * the ifindex/gateway filters in @cfg.  Takes a dst reference before
 * removal so the entry cannot vanish underneath us.
 */
3176 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3180 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3183 if (cfg->fc_flags & RTF_GATEWAY &&
3184 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3186 if (dst_hold_safe(&rt->dst))
3187 rc = rt6_remove_exception_rt(rt);
/*
 * ip6_route_del - delete route(s) matching @cfg.  Locates the FIB node
 * for the dst/src prefixes, then either removes a matching cached
 * exception route (RTF_CACHE) or a fib entry matched on ifindex,
 * gateway, metric and protocol.  With a gateway only the single hop is
 * removed; otherwise siblings are removed too.
 * NOTE(review): elided listing — rcu_read_lock/unlock and some continue
 * statements are not shown.
 */
3192 static int ip6_route_del(struct fib6_config *cfg,
3193 struct netlink_ext_ack *extack)
3195 struct rt6_info *rt_cache;
3196 struct fib6_table *table;
3197 struct fib6_info *rt;
3198 struct fib6_node *fn;
3201 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3203 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3209 fn = fib6_locate(&table->tb6_root,
3210 &cfg->fc_dst, cfg->fc_dst_len,
3211 &cfg->fc_src, cfg->fc_src_len,
3212 !(cfg->fc_flags & RTF_CACHE));
3215 for_each_fib6_node_rt_rcu(fn) {
3216 if (cfg->fc_flags & RTF_CACHE) {
3219 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3222 rc = ip6_del_cached_rt(rt_cache, cfg);
3228 if (cfg->fc_ifindex &&
3229 (!rt->fib6_nh.nh_dev ||
3230 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3232 if (cfg->fc_flags & RTF_GATEWAY &&
3233 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3235 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3237 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3242 /* if gateway was specified only delete the one hop */
3243 if (cfg->fc_flags & RTF_GATEWAY)
3244 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3246 return __ip6_del_rt_siblings(rt, cfg);
/*
 * rt6_do_redirect - handle a received ICMPv6 Redirect for @dst.
 * Validates the redirect per RFC 4861 (length, non-multicast
 * destination, link-local unicast target, accept_redirects sysctl, ND
 * options), updates the neighbour cache, and installs a cached route
 * (exception entry) toward the new first hop.
 * NOTE(review): elided listing — several early-return branches and the
 * final unlock/cleanup are not shown.
 */
3254 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3256 struct netevent_redirect netevent;
3257 struct rt6_info *rt, *nrt = NULL;
3258 struct ndisc_options ndopts;
3259 struct inet6_dev *in6_dev;
3260 struct neighbour *neigh;
3261 struct fib6_info *from;
3263 int optlen, on_link;
3266 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3267 optlen -= sizeof(*msg);
3270 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3274 msg = (struct rd_msg *)icmp6_hdr(skb);
3276 if (ipv6_addr_is_multicast(&msg->dest)) {
3277 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* target == dest means "destination is on-link" (no gateway) */
3282 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3284 } else if (ipv6_addr_type(&msg->target) !=
3285 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3286 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3290 in6_dev = __in6_dev_get(skb->dev);
/* routers forwarding, or hosts with redirects disabled, ignore it */
3293 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3297 * The IP source address of the Redirect MUST be the same as the current
3298 * first-hop router for the specified ICMP Destination Address.
3301 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3302 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3307 if (ndopts.nd_opts_tgt_lladdr) {
3308 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3311 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3316 rt = (struct rt6_info *) dst;
3317 if (rt->rt6i_flags & RTF_REJECT) {
3318 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3322 /* Redirect received -> path was valid.
3323 * Look, redirects are sent only in response to data packets,
3324 * so that this nexthop apparently is reachable. --ANK
3326 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3328 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3333 * We have finally decided to accept it.
3336 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3337 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3338 NEIGH_UPDATE_F_OVERRIDE|
3339 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3340 NEIGH_UPDATE_F_ISROUTER)),
3341 NDISC_REDIRECT, &ndopts);
3344 from = rcu_dereference(rt->from);
3345 fib6_info_hold(from);
/* build the cached route pointing at the redirect target */
3348 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3352 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3354 nrt->rt6i_flags &= ~RTF_GATEWAY;
3356 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3358 /* No need to remove rt from the exception table if rt is
3359 * a cached route because rt6_insert_exception() will
3362 if (rt6_insert_exception(nrt, from)) {
3363 dst_release_immediate(&nrt->dst);
3367 netevent.old = &rt->dst;
3368 netevent.new = &nrt->dst;
3369 netevent.daddr = &msg->dest;
3370 netevent.neigh = neigh;
3371 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3374 fib6_info_release(from);
3375 neigh_release(neigh);
3378 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_get_route_info - look up an RA Route Information (RTF_ROUTEINFO)
 * entry for @prefix/@prefixlen via @gwaddr on @dev in the l3mdev (or
 * RT6_TABLE_INFO) table.  Returns the matching fib6_info or NULL.
 */
3379 static struct fib6_info *rt6_get_route_info(struct net *net,
3380 const struct in6_addr *prefix, int prefixlen,
3381 const struct in6_addr *gwaddr,
3382 struct net_device *dev)
3384 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3385 int ifindex = dev->ifindex;
3386 struct fib6_node *fn;
3387 struct fib6_info *rt = NULL;
3388 struct fib6_table *table;
3390 table = fib6_get_table(net, tb_id);
3395 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3399 for_each_fib6_node_rt_rcu(fn) {
3400 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3402 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3404 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
/*
 * rt6_add_route_info - install a route learned from an RA Route
 * Information option (RFC 4191).  Builds a fib6_config for the
 * prefix/gateway, adds it, and returns the resulting entry via
 * rt6_get_route_info().
 */
3414 static struct fib6_info *rt6_add_route_info(struct net *net,
3415 const struct in6_addr *prefix, int prefixlen,
3416 const struct in6_addr *gwaddr,
3417 struct net_device *dev,
3420 struct fib6_config cfg = {
3421 .fc_metric = IP6_RT_PRIO_USER,
3422 .fc_ifindex = dev->ifindex,
3423 .fc_dst_len = prefixlen,
3424 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3425 RTF_UP | RTF_PREF(pref),
3426 .fc_protocol = RTPROT_RA,
3427 .fc_type = RTN_UNICAST,
3428 .fc_nlinfo.portid = 0,
3429 .fc_nlinfo.nlh = NULL,
3430 .fc_nlinfo.nl_net = net,
3433 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3434 cfg.fc_dst = *prefix;
3435 cfg.fc_gateway = *gwaddr;
3437 /* We should treat it as a default route if prefix length is 0. */
3439 cfg.fc_flags |= RTF_DEFAULT;
/* result of the add is ignored; the lookup below reports success */
3441 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3443 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/*
 * rt6_get_dflt_router - find the RA-learned default route via gateway
 * @addr on @dev (RTF_ADDRCONF|RTF_DEFAULT) in the l3mdev (or
 * RT6_TABLE_DFLT) table.  Returns the matching fib6_info or NULL.
 */
3447 struct fib6_info *rt6_get_dflt_router(struct net *net,
3448 const struct in6_addr *addr,
3449 struct net_device *dev)
3451 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3452 struct fib6_info *rt;
3453 struct fib6_table *table;
3455 table = fib6_get_table(net, tb_id);
3460 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3461 if (dev == rt->fib6_nh.nh_dev &&
3462 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3463 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
/*
 * rt6_add_dflt_router - install an RA-learned default router via
 * @gwaddr on @dev with router preference @pref, marking the table as
 * containing a default router.  Returns the inserted entry via
 * rt6_get_dflt_router().
 */
3472 struct fib6_info *rt6_add_dflt_router(struct net *net,
3473 const struct in6_addr *gwaddr,
3474 struct net_device *dev,
3477 struct fib6_config cfg = {
3478 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3479 .fc_metric = IP6_RT_PRIO_USER,
3480 .fc_ifindex = dev->ifindex,
3481 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3482 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3483 .fc_protocol = RTPROT_RA,
3484 .fc_type = RTN_UNICAST,
3485 .fc_nlinfo.portid = 0,
3486 .fc_nlinfo.nlh = NULL,
3487 .fc_nlinfo.nl_net = net,
3490 cfg.fc_gateway = *gwaddr;
3492 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3493 struct fib6_table *table;
3495 table = fib6_get_table(dev_net(dev), cfg.fc_table);
/* remember that this table now holds a default router, so
 * rt6_purge_dflt_routers() knows to scan it */
3497 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3500 return rt6_get_dflt_router(net, gwaddr, dev);
/*
 * __rt6_purge_dflt_routers - delete all RA-learned default/addrconf
 * routes from @table, except on interfaces with accept_ra == 2 (always
 * accept RAs), then clear the table's default-router flag.
 */
3503 static void __rt6_purge_dflt_routers(struct net *net,
3504 struct fib6_table *table)
3506 struct fib6_info *rt;
3510 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3511 struct net_device *dev = fib6_info_nh_dev(rt);
3512 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3514 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3515 (!idev || idev->cnf.accept_ra != 2)) {
3518 ip6_del_rt(net, rt);
3524 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/*
 * rt6_purge_dflt_routers - walk every FIB table hash bucket and purge
 * default routers from tables flagged RT6_TABLE_HAS_DFLT_ROUTER.
 */
3527 void rt6_purge_dflt_routers(struct net *net)
3529 struct fib6_table *table;
3530 struct hlist_head *head;
3535 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3536 head = &net->ipv6.fib_table_hash[h];
3537 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3538 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3539 __rt6_purge_dflt_routers(net, table);
/*
 * rtmsg_to_fib6_config - translate a legacy ioctl in6_rtmsg into a
 * fib6_config, zeroing the config first and copying field-for-field.
 */
3546 static void rtmsg_to_fib6_config(struct net *net,
3547 struct in6_rtmsg *rtmsg,
3548 struct fib6_config *cfg)
3550 memset(cfg, 0, sizeof(*cfg));
3552 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3554 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3555 cfg->fc_metric = rtmsg->rtmsg_metric;
3556 cfg->fc_expires = rtmsg->rtmsg_info;
3557 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3558 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3559 cfg->fc_flags = rtmsg->rtmsg_flags;
3560 cfg->fc_type = rtmsg->rtmsg_type;
3562 cfg->fc_nlinfo.nl_net = net;
3564 cfg->fc_dst = rtmsg->rtmsg_dst;
3565 cfg->fc_src = rtmsg->rtmsg_src;
3566 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * ipv6_route_ioctl - SIOCADDRT/SIOCDELRT handler.  Requires
 * CAP_NET_ADMIN, copies the in6_rtmsg from userspace, converts it to a
 * fib6_config and adds or deletes the route accordingly.
 */
3569 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3571 struct fib6_config cfg;
3572 struct in6_rtmsg rtmsg;
3576 case SIOCADDRT: /* Add a route */
3577 case SIOCDELRT: /* Delete a route */
3578 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3580 err = copy_from_user(&rtmsg, arg,
3581 sizeof(struct in6_rtmsg));
3585 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3590 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3593 err = ip6_route_del(&cfg, NULL);
3607 * Drop the packet on the floor
/*
 * ip6_pkt_drop - common drop path for reject-type routes: bump the
 * appropriate in/out no-route (or in-addr-error for ::) SNMP counter,
 * send an ICMPv6 destination-unreachable with @code, and free the skb
 * (elided tail).
 */
3610 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3613 struct dst_entry *dst = skb_dst(skb);
3614 switch (ipstats_mib_noroutes) {
3615 case IPSTATS_MIB_INNOROUTES:
3616 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
/* unspecified destination counts as an address error instead */
3617 if (type == IPV6_ADDR_ANY) {
3618 IP6_INC_STATS(dev_net(dst->dev),
3619 __in6_dev_get_safely(skb->dev),
3620 IPSTATS_MIB_INADDRERRORS);
3624 case IPSTATS_MIB_OUTNOROUTES:
3625 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3626 ipstats_mib_noroutes);
3629 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* ip6_pkt_discard - input path for blackhole/unreachable routes. */
3634 static int ip6_pkt_discard(struct sk_buff *skb)
3636 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* ip6_pkt_discard_out - output-path counterpart of ip6_pkt_discard. */
3639 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3641 skb->dev = skb_dst(skb)->dev;
3642 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* ip6_pkt_prohibit - input path for RTN_PROHIBIT routes
 * (administratively prohibited). */
3645 static int ip6_pkt_prohibit(struct sk_buff *skb)
3647 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* ip6_pkt_prohibit_out - output-path counterpart of ip6_pkt_prohibit. */
3650 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3652 skb->dev = skb_dst(skb)->dev;
3653 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3657 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_f6i_alloc - build a /128 local or anycast fib6_info for
 * @addr on @idev's device, placed in the l3mdev (or RT6_TABLE_LOCAL)
 * table.  Returns the new entry or ERR_PTR(-ENOMEM).
 */
3660 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3661 struct inet6_dev *idev,
3662 const struct in6_addr *addr,
3663 bool anycast, gfp_t gfp_flags)
3666 struct net_device *dev = idev->dev;
3667 struct fib6_info *f6i;
3669 f6i = fib6_info_alloc(gfp_flags);
3671 return ERR_PTR(-ENOMEM);
3673 f6i->dst_nocount = true;
3674 f6i->dst_host = true;
3675 f6i->fib6_protocol = RTPROT_KERNEL;
3676 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3678 f6i->fib6_type = RTN_ANYCAST;
3679 f6i->fib6_flags |= RTF_ANYCAST;
3681 f6i->fib6_type = RTN_LOCAL;
3682 f6i->fib6_flags |= RTF_LOCAL;
3685 f6i->fib6_nh.nh_gw = *addr;
3687 f6i->fib6_nh.nh_dev = dev;
3688 f6i->fib6_dst.addr = *addr;
3689 f6i->fib6_dst.plen = 128;
3690 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3691 f6i->fib6_table = fib6_get_table(net, tb_id);
3696 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(): the device the address
 * lived on (NULL matches any), the namespace, and the removed address. */
3697 struct arg_dev_net_ip {
3698 struct net_device *dev;
3700 struct in6_addr *addr;
/*
 * fib6_remove_prefsrc - fib6_clean_all callback: clear the preferred
 * source from routes that referenced a just-removed local address, and
 * purge it from the route's cached exceptions as well.
 */
3703 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3705 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3706 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3707 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3709 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3710 rt != net->ipv6.fib6_null_entry &&
3711 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3712 spin_lock_bh(&rt6_exception_lock);
3713 /* remove prefsrc entry */
3714 rt->fib6_prefsrc.plen = 0;
3715 /* need to update cache as well */
3716 rt6_exceptions_remove_prefsrc(rt);
3717 spin_unlock_bh(&rt6_exception_lock);
/*
 * rt6_remove_prefsrc - when local address @ifp goes away, walk the FIB
 * and drop it as a preferred source from every route that used it.
 */
3722 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3724 struct net *net = dev_net(ifp->idev->dev);
3725 struct arg_dev_net_ip adni = {
3726 .dev = ifp->idev->dev,
3730 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
/* flag combination identifying an RA-learned router entry */
3733 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3735 /* Remove routers and update dst entries when gateway turn into host. */
/*
 * fib6_clean_tohost - fib6_clean_all callback: drop RA router entries
 * whose gateway matches @arg, and scrub matching exception-table
 * entries (a redirect may have given a cached route a different
 * gateway than its parent).
 */
3736 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3738 struct in6_addr *gateway = (struct in6_addr *)arg;
3740 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3741 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3745 /* Further clean up cached routes in exception table.
3746 * This is needed because cached route may have a different
3747 * gateway than its 'parent' in the case of an ip redirect.
3749 rt6_exceptions_clean_tohost(rt, gateway);
/* rt6_clean_tohost - walk the FIB removing router entries whose gateway
 * has turned into an on-link host. */
3754 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3756 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Walker argument for device up/down FIB walks: the device, the nexthop
 * flags to set/clear, and the netdev notifier event. */
3759 struct arg_netdev_event {
3760 const struct net_device *dev;
3762 unsigned int nh_flags;
3763 unsigned long event;
/*
 * rt6_multipath_first_sibling - find the first ECMP-qualifying route
 * with @rt's metric in @rt's fib6 node, scanning the leaf chain under
 * the table lock.  Returns NULL if none is found (elided tail).
 */
3767 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3769 struct fib6_info *iter;
3770 struct fib6_node *fn;
3772 fn = rcu_dereference_protected(rt->fib6_node,
3773 lockdep_is_held(&rt->fib6_table->tb6_lock));
3774 iter = rcu_dereference_protected(fn->leaf,
3775 lockdep_is_held(&rt->fib6_table->tb6_lock));
3777 if (iter->fib6_metric == rt->fib6_metric &&
3778 rt6_qualify_for_ecmp(iter))
3780 iter = rcu_dereference_protected(iter->rt6_next,
3781 lockdep_is_held(&rt->fib6_table->tb6_lock));
/* rt6_is_dead - a nexthop is dead if flagged RTNH_F_DEAD, or flagged
 * RTNH_F_LINKDOWN while the route ignores linkdown nexthops. */
3787 static bool rt6_is_dead(const struct fib6_info *rt)
3789 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3790 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3791 fib6_ignore_linkdown(rt)))
/* rt6_multipath_total_weight - sum nh_weight over @rt and its siblings,
 * skipping dead nexthops. */
3797 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3799 struct fib6_info *iter;
3802 if (!rt6_is_dead(rt))
3803 total += rt->fib6_nh.nh_weight;
3805 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3806 if (!rt6_is_dead(iter))
3807 total += iter->fib6_nh.nh_weight;
/*
 * rt6_upper_bound_set - assign the hash upper bound for one nexthop:
 * cumulative weight scaled into 31-bit space, or -1 (never selected)
 * for a dead nexthop.  @weight accumulates across siblings.
 */
3813 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3815 int upper_bound = -1;
3817 if (!rt6_is_dead(rt)) {
3818 *weight += rt->fib6_nh.nh_weight;
3819 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3822 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
/* rt6_multipath_upper_bound_set - recompute upper bounds for @rt and
 * all its siblings against the combined @total weight. */
3825 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3827 struct fib6_info *iter;
3830 rt6_upper_bound_set(rt, &weight, total);
3832 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3833 rt6_upper_bound_set(iter, &weight, total);
/*
 * rt6_multipath_rebalance - recompute ECMP hash bounds for the
 * multipath group containing @rt, starting from the first sibling so
 * bounds stay consistent with lookup order.  No-op for single-path or
 * flush-marked routes.
 */
3836 void rt6_multipath_rebalance(struct fib6_info *rt)
3838 struct fib6_info *first;
3841 /* In case the entire multipath route was marked for flushing,
3842 * then there is no need to rebalance upon the removal of every
3845 if (!rt->fib6_nsiblings || rt->should_flush)
3848 /* During lookup routes are evaluated in order, so we need to
3849 * make sure upper bounds are assigned from the first sibling
3852 first = rt6_multipath_first_sibling(rt);
3853 if (WARN_ON_ONCE(!first))
3856 total = rt6_multipath_total_weight(first);
3857 rt6_multipath_upper_bound_set(first, total);
/*
 * fib6_ifup - fib6_clean_all callback for device-up: clear the given
 * nexthop flags on routes out of the device, bump the sernum so
 * lookups revalidate, and rebalance the multipath group.
 */
3860 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3862 const struct arg_netdev_event *arg = p_arg;
3863 struct net *net = dev_net(arg->dev);
3865 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3866 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3867 fib6_update_sernum_upto_root(net, rt);
3868 rt6_multipath_rebalance(rt);
/*
 * rt6_sync_up - device came up: clear @nh_flags on its routes.  When
 * clearing DEAD and carrier is present, clear LINKDOWN too.
 */
3874 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3876 struct arg_netdev_event arg = {
3879 .nh_flags = nh_flags,
3883 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3884 arg.nh_flags |= RTNH_F_LINKDOWN;
3886 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
/* rt6_multipath_uses_dev - true if @rt or any sibling nexthop egresses
 * via @dev. */
3889 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3890 const struct net_device *dev)
3892 struct fib6_info *iter;
3894 if (rt->fib6_nh.nh_dev == dev)
3896 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3897 if (iter->fib6_nh.nh_dev == dev)
/* rt6_multipath_flush - mark @rt and every sibling for flushing so the
 * whole multipath route is removed as a unit. */
3903 static void rt6_multipath_flush(struct fib6_info *rt)
3905 struct fib6_info *iter;
3907 rt->should_flush = 1;
3908 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3909 iter->should_flush = 1;
/* rt6_multipath_dead_count - count nexthops (rt + siblings) that are on
 * @down_dev or already flagged RTNH_F_DEAD. */
3912 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3913 const struct net_device *down_dev)
3915 struct fib6_info *iter;
3916 unsigned int dead = 0;
3918 if (rt->fib6_nh.nh_dev == down_dev ||
3919 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3921 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3922 if (iter->fib6_nh.nh_dev == down_dev ||
3923 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
/* rt6_multipath_nh_flags_set - set @nh_flags on every nexthop (rt +
 * siblings) whose device is @dev. */
3929 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3930 const struct net_device *dev,
3931 unsigned int nh_flags)
3933 struct fib6_info *iter;
3935 if (rt->fib6_nh.nh_dev == dev)
3936 rt->fib6_nh.nh_flags |= nh_flags;
3937 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3938 if (iter->fib6_nh.nh_dev == dev)
3939 iter->fib6_nh.nh_flags |= nh_flags;
3942 /* called with write lock held for table with rt */
/*
 * fib6_ifdown - fib6_clean_all callback for device down/unregister.
 * Returning -1 asks the walker to delete the route; 0 keeps it.  For
 * multipath routes either the whole group is flushed (when every
 * nexthop would be dead) or only the affected nexthops are marked
 * DEAD/LINKDOWN and the group rebalanced.
 * NOTE(review): elided listing — some case labels and returns are not
 * shown.
 */
3943 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3945 const struct arg_netdev_event *arg = p_arg;
3946 const struct net_device *dev = arg->dev;
3947 struct net *net = dev_net(dev);
3949 if (rt == net->ipv6.fib6_null_entry)
3952 switch (arg->event) {
3953 case NETDEV_UNREGISTER:
3954 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3956 if (rt->should_flush)
3958 if (!rt->fib6_nsiblings)
3959 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3960 if (rt6_multipath_uses_dev(rt, dev)) {
3963 count = rt6_multipath_dead_count(rt, dev);
/* all nexthops would be dead: flush the entire group */
3964 if (rt->fib6_nsiblings + 1 == count) {
3965 rt6_multipath_flush(rt);
3968 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3970 fib6_update_sernum(net, rt);
3971 rt6_multipath_rebalance(rt);
3975 if (rt->fib6_nh.nh_dev != dev ||
3976 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
3978 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3979 rt6_multipath_rebalance(rt);
/* rt6_sync_down_dev - propagate a device-down/unregister event through
 * the FIB via fib6_ifdown(). */
3986 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3988 struct arg_netdev_event arg = {
3995 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
/* rt6_disable_ip - full IPv6 teardown for @dev: sync routes down, flush
 * uncached routes on the device, and drop its ND neighbour entries. */
3998 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4000 rt6_sync_down_dev(dev, event);
4001 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4002 neigh_ifdown(&nd_tbl, dev);
/* Walker argument for rt6_mtu_change_route(): the device whose MTU
 * changed (the new mtu field is elided from this listing). */
4005 struct rt6_mtu_change_arg {
4006 struct net_device *dev;
/*
 * rt6_mtu_change_route - fib6_clean_all callback for a device MTU
 * change: update the route's RTAX_MTU metric (unless locked) and the
 * PMTU stored in its cached exception routes.
 */
4010 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4012 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4013 struct inet6_dev *idev;
4015 /* In IPv6 pmtu discovery is not optional,
4016 so that RTAX_MTU lock cannot disable it.
4017 We still use this lock to block changes
4018 caused by addrconf/ndisc.
4021 idev = __in6_dev_get(arg->dev);
4025 /* For administrative MTU increase, there is no way to discover
4026 IPv6 PMTU increase, so PMTU increase should be updated here.
4027 Since RFC 1981 doesn't include administrative MTU increase
4028 update PMTU increase is a MUST. (i.e. jumbo frame)
4030 if (rt->fib6_nh.nh_dev == arg->dev &&
4031 !fib6_metric_locked(rt, RTAX_MTU)) {
4032 u32 mtu = rt->fib6_pmtu;
4034 if (mtu >= arg->mtu ||
4035 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4036 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4038 spin_lock_bh(&rt6_exception_lock);
4039 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4040 spin_unlock_bh(&rt6_exception_lock);
/* rt6_mtu_change - device MTU changed: walk the FIB updating route and
 * exception PMTUs via rt6_mtu_change_route(). */
4045 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4047 struct rt6_mtu_change_arg arg = {
4052 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE. */
4055 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4056 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4057 [RTA_OIF] = { .type = NLA_U32 },
4058 [RTA_IIF] = { .type = NLA_U32 },
4059 [RTA_PRIORITY] = { .type = NLA_U32 },
4060 [RTA_METRICS] = { .type = NLA_NESTED },
4061 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4062 [RTA_PREF] = { .type = NLA_U8 },
4063 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4064 [RTA_ENCAP] = { .type = NLA_NESTED },
4065 [RTA_EXPIRES] = { .type = NLA_U32 },
4066 [RTA_UID] = { .type = NLA_U32 },
4067 [RTA_MARK] = { .type = NLA_U32 },
/*
 * rtm_to_fib6_config - parse an RTM_NEWROUTE/RTM_DELROUTE netlink
 * message into a fib6_config: validates attributes against
 * rtm_ipv6_policy, maps rtm_type to RTF_* flags, and copies the
 * addresses, metrics, multipath, encap and expiry attributes.
 * NOTE(review): elided listing — some attribute branches and error
 * returns are not shown.
 */
4070 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4071 struct fib6_config *cfg,
4072 struct netlink_ext_ack *extack)
4075 struct nlattr *tb[RTA_MAX+1];
4079 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4085 rtm = nlmsg_data(nlh);
4086 memset(cfg, 0, sizeof(*cfg));
4088 cfg->fc_table = rtm->rtm_table;
4089 cfg->fc_dst_len = rtm->rtm_dst_len;
4090 cfg->fc_src_len = rtm->rtm_src_len;
4091 cfg->fc_flags = RTF_UP;
4092 cfg->fc_protocol = rtm->rtm_protocol;
4093 cfg->fc_type = rtm->rtm_type;
/* reject-style route types all map to RTF_REJECT */
4095 if (rtm->rtm_type == RTN_UNREACHABLE ||
4096 rtm->rtm_type == RTN_BLACKHOLE ||
4097 rtm->rtm_type == RTN_PROHIBIT ||
4098 rtm->rtm_type == RTN_THROW)
4099 cfg->fc_flags |= RTF_REJECT;
4101 if (rtm->rtm_type == RTN_LOCAL)
4102 cfg->fc_flags |= RTF_LOCAL;
4104 if (rtm->rtm_flags & RTM_F_CLONED)
4105 cfg->fc_flags |= RTF_CACHE;
4107 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4109 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4110 cfg->fc_nlinfo.nlh = nlh;
4111 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4113 if (tb[RTA_GATEWAY]) {
4114 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4115 cfg->fc_flags |= RTF_GATEWAY;
/* copy only the bytes covered by the prefix length */
4119 int plen = (rtm->rtm_dst_len + 7) >> 3;
4121 if (nla_len(tb[RTA_DST]) < plen)
4124 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4128 int plen = (rtm->rtm_src_len + 7) >> 3;
4130 if (nla_len(tb[RTA_SRC]) < plen)
4133 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4136 if (tb[RTA_PREFSRC])
4137 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4140 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4142 if (tb[RTA_PRIORITY])
4143 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4145 if (tb[RTA_METRICS]) {
4146 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4147 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4151 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4153 if (tb[RTA_MULTIPATH]) {
4154 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4155 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4157 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4158 cfg->fc_mp_len, extack);
/* out-of-range router preference falls back to medium (RFC 4191) */
4164 pref = nla_get_u8(tb[RTA_PREF]);
4165 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4166 pref != ICMPV6_ROUTER_PREF_HIGH)
4167 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4168 cfg->fc_flags |= RTF_PREF(pref);
4172 cfg->fc_encap = tb[RTA_ENCAP];
4174 if (tb[RTA_ENCAP_TYPE]) {
4175 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4177 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4182 if (tb[RTA_EXPIRES]) {
4183 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4185 if (addrconf_finite_timeout(timeout)) {
4186 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4187 cfg->fc_flags |= RTF_EXPIRES;
/* Per-nexthop bookkeeping entry for multipath add/replace: the built
 * fib6_info, the config it came from, and its list linkage.
 * NOTE(review): the struct tag line is elided from this listing. */
4197 struct fib6_info *fib6_info;
4198 struct fib6_config r_cfg;
4199 struct list_head next;
/* ip6_print_replace_route_err - log every nexthop of a failed multipath
 * replace so the admin can audit what state was left installed. */
4202 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4206 list_for_each_entry(nh, rt6_nh_list, next) {
4207 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4208 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4209 nh->r_cfg.fc_ifindex);
/*
 * ip6_route_info_append - queue one parsed nexthop onto @rt6_nh_list
 * for a multipath add, skipping duplicates and converting metrics.
 * NOTE(review): elided listing — the duplicate/ENOMEM returns are not
 * shown.
 */
4213 static int ip6_route_info_append(struct net *net,
4214 struct list_head *rt6_nh_list,
4215 struct fib6_info *rt,
4216 struct fib6_config *r_cfg)
4221 list_for_each_entry(nh, rt6_nh_list, next) {
4222 /* check if fib6_info already exists */
4223 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4227 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4231 err = ip6_convert_metrics(net, rt, r_cfg);
4236 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4237 list_add_tail(&nh->next, rt6_nh_list);
/*
 * ip6_route_mpath_notify - send the RTM_NEWROUTE notification for a
 * multipath add/append/replace, starting the dump at the first sibling
 * so userspace sees a consistent route.
 */
4242 static void ip6_route_mpath_notify(struct fib6_info *rt,
4243 struct fib6_info *rt_last,
4244 struct nl_info *info,
4247 /* if this is an APPEND route, then rt points to the first route
4248 * inserted and rt_last points to last route inserted. Userspace
4249 * wants a consistent dump of the route which starts at the first
4250 * nexthop. Since sibling routes are always added at the end of
4251 * the list, find the first sibling of the last route appended
4253 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4254 rt = list_first_entry(&rt_last->fib6_siblings,
4260 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* Add a multipath route from an RTA_MULTIPATH netlink request.
 * Two-phase: (1) parse each rtnexthop into a fib6_info queued on
 * rt6_nh_list; (2) insert them one by one, then emit a single
 * notification for the whole route. On a mid-insert failure the
 * already-inserted nexthops are deleted again.
 */
4263 static int ip6_route_multipath_add(struct fib6_config *cfg,
4264 struct netlink_ext_ack *extack)
4266 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4267 struct nl_info *info = &cfg->fc_nlinfo;
4268 struct fib6_config r_cfg;
4269 struct rtnexthop *rtnh;
4270 struct fib6_info *rt;
4271 struct rt6_nh *err_nh;
4272 struct rt6_nh *nh, *nh_safe;
4278 int replace = (cfg->fc_nlinfo.nlh &&
4279 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4280 LIST_HEAD(rt6_nh_list);
/* Propagate the user's REPLACE/APPEND intent into the notification flags. */
4282 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4283 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4284 nlflags |= NLM_F_APPEND;
4286 remaining = cfg->fc_mp_len;
4287 rtnh = (struct rtnexthop *)cfg->fc_mp;
4289 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4290 * fib6_info structs per nexthop
4292 while (rtnh_ok(rtnh, remaining)) {
/* Per-nexthop config starts as a copy of the route-level config. */
4293 memcpy(&r_cfg, cfg, sizeof(*cfg));
4294 if (rtnh->rtnh_ifindex)
4295 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4297 attrlen = rtnh_attrlen(rtnh);
4299 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
/* Per-nexthop gateway and encap attributes override the copy. */
4301 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4303 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4304 r_cfg.fc_flags |= RTF_GATEWAY;
4306 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4307 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4309 r_cfg.fc_encap_type = nla_get_u16(nla);
4312 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4313 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
/* rtnh_hops is weight-1 on the wire; store the real weight. */
4320 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4322 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
/* Append failed (e.g. duplicate): drop the ref taken by create. */
4325 fib6_info_release(rt);
4329 rtnh = rtnh_next(rtnh, &remaining);
4332 /* for add and replace send one notification with all nexthops.
4333 * Skip the notification in fib6_add_rt2node and send one with
4334 * the full route when done
4336 info->skip_notify = 1;
4339 list_for_each_entry(nh, &rt6_nh_list, next) {
4340 rt_last = nh->fib6_info;
4341 err = __ip6_ins_rt(nh->fib6_info, info, extack);
/* Drop our list reference; on success the FIB tree presumably
 * holds its own reference — the pointer saved below is only
 * valid in that case (NOTE(review): confirm against elided
 * error handling).
 */
4342 fib6_info_release(nh->fib6_info);
4344 /* save reference to first route for notification */
4345 if (!rt_notif && !err)
4346 rt_notif = nh->fib6_info;
4348 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4349 nh->fib6_info = NULL;
4352 ip6_print_replace_route_err(&rt6_nh_list);
4357 /* Because each route is added like a single route we remove
4358 * these flags after the first nexthop: if there is a collision,
4359 * we have already failed to add the first nexthop:
4360 * fib6_add_rt2node() has rejected it; when replacing, old
4361 * nexthops have been replaced by first new, the rest should
4364 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4369 /* success ... tell user about new route */
4370 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4374 /* send notification for routes that were added so that
4375 * the delete notifications sent by ip6_route_del are
4379 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4381 /* Delete routes that were already added */
4382 list_for_each_entry(nh, &rt6_nh_list, next) {
4385 ip6_route_del(&nh->r_cfg, extack);
/* Final cleanup: free any fib6_info never handed to the tree
 * plus every list node.
 */
4389 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4391 fib6_info_release(nh->fib6_info);
4392 list_del(&nh->next);
/* Delete each nexthop of an RTA_MULTIPATH request individually.
 * Deletions are best-effort per entry; based on the visible
 * last_err variable the last failure is presumably what gets
 * returned (return path elided — confirm).
 */
4399 static int ip6_route_multipath_del(struct fib6_config *cfg,
4400 struct netlink_ext_ack *extack)
4402 struct fib6_config r_cfg;
4403 struct rtnexthop *rtnh;
4406 int err = 1, last_err = 0;
4408 remaining = cfg->fc_mp_len;
4409 rtnh = (struct rtnexthop *)cfg->fc_mp;
4411 /* Parse a Multipath Entry */
4412 while (rtnh_ok(rtnh, remaining)) {
4413 memcpy(&r_cfg, cfg, sizeof(*cfg));
4414 if (rtnh->rtnh_ifindex)
4415 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4417 attrlen = rtnh_attrlen(rtnh);
4419 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4421 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
/* 16 bytes == sizeof(struct in6_addr). */
4423 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4424 r_cfg.fc_flags |= RTF_GATEWAY;
4427 err = ip6_route_del(&r_cfg, extack);
4431 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE doit handler: parse the message, then dispatch to the
 * multipath delete path when RTA_MULTIPATH was supplied, otherwise
 * delete as a single route (all nexthops).
 */
4437 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4438 struct netlink_ext_ack *extack)
4440 struct fib6_config cfg;
4443 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4448 return ip6_route_multipath_del(&cfg, extack);
/* Single-route delete removes the whole route including siblings. */
4450 cfg.fc_delete_all_nh = 1;
4451 return ip6_route_del(&cfg, extack);
/* RTM_NEWROUTE doit handler: parse, then add either a multipath route
 * (RTA_MULTIPATH present) or a plain single-nexthop route.
 */
4455 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4456 struct netlink_ext_ack *extack)
4458 struct fib6_config cfg;
4461 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4466 return ip6_route_multipath_add(&cfg, extack);
4468 return ip6_route_add(&cfg, GFP_KERNEL, extack);
/* Worst-case skb size needed to dump @rt as an RTM_NEWROUTE message.
 * rt6_fill_node() relies on this being an upper bound — an -EMSGSIZE
 * from fill indicates a bug here (see inet6_rt_notify()).
 */
4471 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4473 int nexthop_len = 0;
4475 if (rt->fib6_nsiblings) {
/* Per-nexthop cost inside RTA_MULTIPATH, multiplied by the
 * sibling count below.
 */
4476 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4477 + NLA_ALIGN(sizeof(struct rtnexthop))
4478 + nla_total_size(16) /* RTA_GATEWAY */
4479 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4481 nexthop_len *= rt->fib6_nsiblings;
4484 return NLMSG_ALIGN(sizeof(struct rtmsg))
4485 + nla_total_size(16) /* RTA_SRC */
4486 + nla_total_size(16) /* RTA_DST */
4487 + nla_total_size(16) /* RTA_GATEWAY */
4488 + nla_total_size(16) /* RTA_PREFSRC */
4489 + nla_total_size(4) /* RTA_TABLE */
4490 + nla_total_size(4) /* RTA_IIF */
4491 + nla_total_size(4) /* RTA_OIF */
4492 + nla_total_size(4) /* RTA_PRIORITY */
4493 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4494 + nla_total_size(sizeof(struct rta_cacheinfo))
4495 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4496 + nla_total_size(1) /* RTA_PREF */
4497 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
/* Emit the nexthop attributes (gateway, oif, lwtunnel encap) for @rt
 * and accumulate RTNH_F_* status bits into *@flags. @skip_oif is set
 * by the multipath encoder, which carries the ifindex in the
 * rtnexthop header instead of an RTA_OIF attribute.
 */
4501 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4502 unsigned int *flags, bool skip_oif)
4504 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4505 *flags |= RTNH_F_DEAD;
4507 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4508 *flags |= RTNH_F_LINKDOWN;
/* If linkdown routes are ignored on this device, report DEAD too. */
4511 if (fib6_ignore_linkdown(rt))
4512 *flags |= RTNH_F_DEAD;
4516 if (rt->fib6_flags & RTF_GATEWAY) {
4517 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4518 goto nla_put_failure;
4521 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4522 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4523 *flags |= RTNH_F_OFFLOAD;
4525 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4526 if (!skip_oif && rt->fib6_nh.nh_dev &&
4527 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4528 goto nla_put_failure;
4530 if (rt->fib6_nh.nh_lwtstate &&
4531 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4532 goto nla_put_failure;
4540 /* add multipath next hop */
/* Append one rtnexthop record for @rt inside an open RTA_MULTIPATH
 * nest; the header is reserved first and its length/flags are patched
 * once the nested attributes are in place.
 */
4541 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4543 const struct net_device *dev = rt->fib6_nh.nh_dev;
4544 struct rtnexthop *rtnh;
4545 unsigned int flags = 0;
4547 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4549 goto nla_put_failure;
/* Wire format carries weight-1. */
4551 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4552 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
/* skip_oif=true: ifindex already lives in the rtnexthop header. */
4554 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4555 goto nla_put_failure;
4557 rtnh->rtnh_flags = flags;
4559 /* length of rtnetlink header + attributes */
4560 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* Serialize @rt into an RTM message on @skb. @dst/@dest/@src are set
 * for getroute replies (a resolved lookup for a concrete flow) and
 * NULL for dumps/notifications, which encode the FIB entry itself.
 * Returns 0 on success; on failure the partially-built message is
 * cancelled and (per netlink convention) -EMSGSIZE is presumably
 * returned — return statements are elided here, confirm.
 */
4568 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4569 struct fib6_info *rt, struct dst_entry *dst,
4570 struct in6_addr *dest, struct in6_addr *src,
4571 int iif, int type, u32 portid, u32 seq,
4575 struct nlmsghdr *nlh;
4580 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4584 rtm = nlmsg_data(nlh);
4585 rtm->rtm_family = AF_INET6;
4586 rtm->rtm_dst_len = rt->fib6_dst.plen;
4587 rtm->rtm_src_len = rt->fib6_src.plen;
4590 table = rt->fib6_table->tb6_id;
4592 table = RT6_TABLE_UNSPEC;
4593 rtm->rtm_table = table;
4594 if (nla_put_u32(skb, RTA_TABLE, table))
4595 goto nla_put_failure;
4597 rtm->rtm_type = rt->fib6_type;
4599 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4600 rtm->rtm_protocol = rt->fib6_protocol;
4602 if (rt->fib6_flags & RTF_CACHE)
4603 rtm->rtm_flags |= RTM_F_CLONED;
/* Resolved lookups report the full /128 destination asked about. */
4606 if (nla_put_in6_addr(skb, RTA_DST, dest))
4607 goto nla_put_failure;
4608 rtm->rtm_dst_len = 128;
4609 } else if (rtm->rtm_dst_len)
4610 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4611 goto nla_put_failure;
4612 #ifdef CONFIG_IPV6_SUBTREES
4614 if (nla_put_in6_addr(skb, RTA_SRC, src))
4615 goto nla_put_failure;
4616 rtm->rtm_src_len = 128;
4617 } else if (rtm->rtm_src_len &&
4618 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4619 goto nla_put_failure;
4622 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved via the ip6mr cache instead. */
4623 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4624 int err = ip6mr_get_route(net, skb, rtm, portid);
4629 goto nla_put_failure;
4632 if (nla_put_u32(skb, RTA_IIF, iif))
4633 goto nla_put_failure;
4635 struct in6_addr saddr_buf;
4636 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4637 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4638 goto nla_put_failure;
4641 if (rt->fib6_prefsrc.plen) {
4642 struct in6_addr saddr_buf;
4643 saddr_buf = rt->fib6_prefsrc.addr;
4644 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4645 goto nla_put_failure;
/* Metrics come from the dst for resolved lookups, else the FIB entry. */
4648 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4649 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4650 goto nla_put_failure;
4652 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4653 goto nla_put_failure;
4655 /* For multipath routes, walk the siblings list and add
4656 * each as a nexthop within RTA_MULTIPATH.
4658 if (rt->fib6_nsiblings) {
4659 struct fib6_info *sibling, *next_sibling;
4662 mp = nla_nest_start(skb, RTA_MULTIPATH);
4664 goto nla_put_failure;
4666 if (rt6_add_nexthop(skb, rt) < 0)
4667 goto nla_put_failure;
4669 list_for_each_entry_safe(sibling, next_sibling,
4670 &rt->fib6_siblings, fib6_siblings) {
4671 if (rt6_add_nexthop(skb, sibling) < 0)
4672 goto nla_put_failure;
4675 nla_nest_end(skb, mp);
/* Single-nexthop route: inline attributes, flags go into rtm_flags. */
4677 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4678 goto nla_put_failure;
4681 if (rt->fib6_flags & RTF_EXPIRES) {
4682 expires = dst ? dst->expires : rt->expires;
4686 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4687 goto nla_put_failure;
4689 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4690 goto nla_put_failure;
4693 nlmsg_end(skb, nlh);
/* Error path: roll back everything added since nlmsg_put(). */
4697 nlmsg_cancel(skb, nlh);
/* fib6 tree-walk callback for RTM_GETROUTE dumps: serialize one route
 * into the dump skb, honoring the RTM_F_PREFIX filter and skipping the
 * null entry.
 */
4701 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4703 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4704 struct net *net = arg->net;
4706 if (rt == net->ipv6.fib6_null_entry)
/* Only inspect filter flags if the request actually carried a rtmsg. */
4709 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4710 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4712 /* user wants prefix routes only */
4713 if (rtm->rtm_flags & RTM_F_PREFIX &&
4714 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4715 /* success since this is not a prefix route */
/* Dump form: no dst/dest/src — encode the FIB entry itself. */
4720 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4721 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4722 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
/* RTM_GETROUTE doit handler: build a flow from the request attributes,
 * perform an input-path lookup (RTA_IIF given) or an output lookup
 * otherwise, and reply with either the matched FIB entry
 * (RTM_F_FIB_MATCH) or the fully resolved route.
 */
4725 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4726 struct netlink_ext_ack *extack)
4728 struct net *net = sock_net(in_skb->sk);
4729 struct nlattr *tb[RTA_MAX+1];
4730 int err, iif = 0, oif = 0;
4731 struct fib6_info *from;
4732 struct dst_entry *dst;
4733 struct rt6_info *rt;
4734 struct sk_buff *skb;
4739 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4745 memset(&fl6, 0, sizeof(fl6));
4746 rtm = nlmsg_data(nlh);
4747 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4748 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
/* Addresses must be full-size in6_addr attributes. */
4751 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4754 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4758 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4761 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4765 iif = nla_get_u32(tb[RTA_IIF]);
4768 oif = nla_get_u32(tb[RTA_OIF]);
4771 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4774 fl6.flowi6_uid = make_kuid(current_user_ns(),
4775 nla_get_u32(tb[RTA_UID]));
/* No RTA_UID: input lookups get INVALID_UID, output the caller's uid. */
4777 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4780 struct net_device *dev;
/* Input-path lookup: resolve as if the packet arrived on @iif. */
4785 dev = dev_get_by_index_rcu(net, iif);
4792 fl6.flowi6_iif = iif;
4794 if (!ipv6_addr_any(&fl6.saddr))
4795 flags |= RT6_LOOKUP_F_HAS_SADDR;
4797 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4801 fl6.flowi6_oif = oif;
4803 dst = ip6_route_output(net, NULL, &fl6);
4807 rt = container_of(dst, struct rt6_info, dst);
4808 if (rt->dst.error) {
4809 err = rt->dst.error;
4814 if (rt == net->ipv6.ip6_null_entry) {
4815 err = rt->dst.error;
4820 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
/* Attach the dst so the skb owns the route reference from here on. */
4827 skb_dst_set(skb, &rt->dst);
4830 from = rcu_dereference(rt->from);
/* fibmatch: report the FIB entry as-is; otherwise the resolved route. */
4833 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4834 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4837 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4838 &fl6.saddr, iif, RTM_NEWROUTE,
4839 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4848 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* Broadcast an rtnetlink event (@event, e.g. RTM_NEWROUTE/RTM_DELROUTE)
 * for @rt to RTNLGRP_IPV6_ROUTE listeners. gfp_any() lets this run
 * from both process and softirq context.
 */
4853 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4854 unsigned int nlm_flags)
4856 struct sk_buff *skb;
4857 struct net *net = info->nl_net;
4862 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* rt6_nlmsg_size() must be an upper bound for this route's message. */
4864 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4868 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4869 event, info->portid, seq, nlm_flags);
4871 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4872 WARN_ON(err == -EMSGSIZE);
4876 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4877 info->nlh, gfp_any());
/* Failure path: tell listeners the notification was lost. */
4881 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdevice notifier: bind the per-netns special routes (null/
 * prohibit/blackhole entries) to the loopback device on REGISTER and
 * drop their idev references on UNREGISTER. Non-loopback devices are
 * ignored.
 */
4884 static int ip6_route_dev_notify(struct notifier_block *this,
4885 unsigned long event, void *ptr)
4887 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4888 struct net *net = dev_net(dev);
4890 if (!(dev->flags & IFF_LOOPBACK))
4893 if (event == NETDEV_REGISTER) {
4894 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4895 net->ipv6.ip6_null_entry->dst.dev = dev;
4896 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4897 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4898 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4899 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4900 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4901 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4903 } else if (event == NETDEV_UNREGISTER &&
4904 dev->reg_state != NETREG_UNREGISTERED) {
4905 /* NETDEV_UNREGISTER could be fired for multiple times by
4906 * netdev_wait_allrefs(). Make sure we only call this once.
4908 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4909 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4910 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4911 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4922 #ifdef CONFIG_PROC_FS
/* /proc/net/ipv6_route: seq_file over the routing table (open handler
 * defined outside this view).
 */
4924 static const struct file_operations ipv6_route_proc_fops = {
4925 .open = ipv6_route_open,
4927 .llseek = seq_lseek,
4928 .release = seq_release_net,
/* /proc/net/rt6_stats: one line of hex counters for the netns fib —
 * nodes, route nodes, allocated entries, entries, cache, dst entries,
 * discarded routes.
 */
4931 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4933 struct net *net = (struct net *)seq->private;
4934 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4935 net->ipv6.rt6_stats->fib_nodes,
4936 net->ipv6.rt6_stats->fib_route_nodes,
4937 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4938 net->ipv6.rt6_stats->fib_rt_entries,
4939 net->ipv6.rt6_stats->fib_rt_cache,
4940 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4941 net->ipv6.rt6_stats->fib_discarded_routes);
/* Single-shot open for /proc/net/rt6_stats (netns-aware). */
4946 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4948 return single_open_net(inode, file, rt6_stats_seq_show);
4951 static const struct file_operations rt6_stats_seq_fops = {
4952 .open = rt6_stats_seq_open,
4954 .llseek = seq_lseek,
4955 .release = single_release_net,
4957 #endif /* CONFIG_PROC_FS */
4959 #ifdef CONFIG_SYSCTL
/* Handler for net.ipv6.route.flush: writing a delay value triggers a
 * fib6 garbage-collection run for the owning netns (stashed in extra1).
 */
4962 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4963 void __user *buffer, size_t *lenp, loff_t *ppos)
4970 net = (struct net *)ctl->extra1;
/* Snapshot the delay before proc_dointvec overwrites it with user input.
 * NOTE(review): looks like the pre-write value drives the GC — confirm
 * against the elided surrounding lines.
 */
4971 delay = net->ipv6.sysctl.flush_delay;
4972 proc_dointvec(ctl, write, buffer, lenp, ppos);
4973 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-netns net.ipv6.route.* sysctl table; the .data
 * pointers reference init_net here and are re-targeted per namespace in
 * ipv6_route_sysctl_init(), which indexes this array by position — keep
 * entry order in sync with that function.
 */
4977 struct ctl_table ipv6_route_table_template[] = {
4979 .procname = "flush",
4980 .data = &init_net.ipv6.sysctl.flush_delay,
4981 .maxlen = sizeof(int),
4983 .proc_handler = ipv6_sysctl_rtcache_flush
4986 .procname = "gc_thresh",
4987 .data = &ip6_dst_ops_template.gc_thresh,
4988 .maxlen = sizeof(int),
4990 .proc_handler = proc_dointvec,
4993 .procname = "max_size",
4994 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4995 .maxlen = sizeof(int),
4997 .proc_handler = proc_dointvec,
5000 .procname = "gc_min_interval",
5001 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5002 .maxlen = sizeof(int),
5004 .proc_handler = proc_dointvec_jiffies,
5007 .procname = "gc_timeout",
5008 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5009 .maxlen = sizeof(int),
5011 .proc_handler = proc_dointvec_jiffies,
5014 .procname = "gc_interval",
5015 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5016 .maxlen = sizeof(int),
5018 .proc_handler = proc_dointvec_jiffies,
5021 .procname = "gc_elasticity",
5022 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5023 .maxlen = sizeof(int),
5025 .proc_handler = proc_dointvec,
5028 .procname = "mtu_expires",
5029 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5030 .maxlen = sizeof(int),
5032 .proc_handler = proc_dointvec_jiffies,
5035 .procname = "min_adv_mss",
5036 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5037 .maxlen = sizeof(int),
5039 .proc_handler = proc_dointvec,
/* Millisecond-granularity alias for gc_min_interval (same backing int). */
5042 .procname = "gc_min_interval_ms",
5043 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5044 .maxlen = sizeof(int),
5046 .proc_handler = proc_dointvec_ms_jiffies,
/* Clone the sysctl template for @net and point each entry's .data at
 * the namespace's own fields. Indices here must match the entry order
 * in ipv6_route_table_template.
 */
5051 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5053 struct ctl_table *table;
5055 table = kmemdup(ipv6_route_table_template,
5056 sizeof(ipv6_route_table_template),
5060 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 carries the netns for ipv6_sysctl_rtcache_flush(). */
5061 table[0].extra1 = net;
5062 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5063 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5064 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5065 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5066 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5067 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5068 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5069 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5070 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5072 /* Don't export sysctls to unprivileged users */
5073 if (net->user_ns != &init_user_ns)
5074 table[0].procname = NULL;
/* Per-netns route init: set up dst ops and counters, allocate the
 * special routes (fib6 null entry, null/prohibit/blackhole rt6 entries)
 * and seed the route sysctl defaults. Unwinds allocations in reverse
 * order via the labelled error path at the bottom.
 */
5081 static int __net_init ip6_route_net_init(struct net *net)
5085 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5086 sizeof(net->ipv6.ip6_dst_ops));
5088 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5089 goto out_ip6_dst_ops;
5091 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5092 sizeof(*net->ipv6.fib6_null_entry),
5094 if (!net->ipv6.fib6_null_entry)
5095 goto out_ip6_dst_entries;
5097 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5098 sizeof(*net->ipv6.ip6_null_entry),
5100 if (!net->ipv6.ip6_null_entry)
5101 goto out_fib6_null_entry;
/* Templates carry global ops/metrics pointers; rebind to this netns. */
5102 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5103 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5104 ip6_template_metrics, true);
5106 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5107 net->ipv6.fib6_has_custom_rules = false;
5108 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5109 sizeof(*net->ipv6.ip6_prohibit_entry),
5111 if (!net->ipv6.ip6_prohibit_entry)
5112 goto out_ip6_null_entry;
5113 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5114 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5115 ip6_template_metrics, true);
5117 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5118 sizeof(*net->ipv6.ip6_blk_hole_entry),
5120 if (!net->ipv6.ip6_blk_hole_entry)
5121 goto out_ip6_prohibit_entry;
5122 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5123 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5124 ip6_template_metrics, true);
/* Default tunables; all overridable via net.ipv6.route.* sysctls. */
5127 net->ipv6.sysctl.flush_delay = 0;
5128 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5129 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5130 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5131 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5132 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5133 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* IPV6_MIN_MTU minus TCP (20) and IPv6 (40) headers. */
5134 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5136 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwinding: free in reverse order of allocation. */
5142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5143 out_ip6_prohibit_entry:
5144 kfree(net->ipv6.ip6_prohibit_entry);
5146 kfree(net->ipv6.ip6_null_entry);
5148 out_fib6_null_entry:
5149 kfree(net->ipv6.fib6_null_entry);
5150 out_ip6_dst_entries:
5151 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns route teardown: mirror image of ip6_route_net_init(). */
5156 static void __net_exit ip6_route_net_exit(struct net *net)
5158 kfree(net->ipv6.fib6_null_entry);
5159 kfree(net->ipv6.ip6_null_entry);
5160 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5161 kfree(net->ipv6.ip6_prohibit_entry);
5162 kfree(net->ipv6.ip6_blk_hole_entry);
5164 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: create the /proc/net entries once the rest of
 * the routing state exists.
 */
5167 static int __net_init ip6_route_net_init_late(struct net *net)
5169 #ifdef CONFIG_PROC_FS
5170 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5171 proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
/* Late per-netns exit: remove the /proc/net entries. */
5176 static void __net_exit ip6_route_net_exit_late(struct net *net)
5178 #ifdef CONFIG_PROC_FS
5179 remove_proc_entry("ipv6_route", net->proc_net);
5180 remove_proc_entry("rt6_stats", net->proc_net);
/* Main per-netns route subsystem registration. */
5184 static struct pernet_operations ip6_route_net_ops = {
5185 .init = ip6_route_net_init,
5186 .exit = ip6_route_net_exit,
/* Allocate and install the per-netns IPv6 inetpeer base. */
5189 static int __net_init ipv6_inetpeer_init(struct net *net)
5191 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5195 inet_peer_base_init(bp);
5196 net->ipv6.peers = bp;
/* Tear down the per-netns inetpeer base; invalidate before freeing
 * (free itself is presumably in the elided lines — confirm).
 */
5200 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5202 struct inet_peer_base *bp = net->ipv6.peers;
5204 net->ipv6.peers = NULL;
5205 inetpeer_invalidate_tree(bp);
5209 static struct pernet_operations ipv6_inetpeer_ops = {
5210 .init = ipv6_inetpeer_init,
5211 .exit = ipv6_inetpeer_exit,
5214 static struct pernet_operations ip6_route_net_late_ops = {
5215 .init = ip6_route_net_init_late,
5216 .exit = ip6_route_net_exit_late,
/* Run after addrconf's notifier so device state is settled first. */
5219 static struct notifier_block ip6_route_dev_notifier = {
5220 .notifier_call = ip6_route_dev_notify,
5221 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/* Boot-time fixup for init_net: loopback registered before the notifier
 * could run, so bind the special routes to it by hand here.
 */
5224 void __init ip6_route_init_special_entries(void)
5226 /* Registering of the loopback is done before this portion of code,
5227 * the loopback reference in rt6_info will not be taken, do it
5228 * manually for init_net */
5229 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5230 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5231 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5232 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5233 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5234 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5235 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5236 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* Module init: create the dst cache, register pernet subsystems, fib6
 * rules, rtnetlink handlers and the netdev notifier, and initialize the
 * per-cpu uncached-route lists. Failures unwind through the labelled
 * error path in reverse registration order.
 */
5240 int __init ip6_route_init(void)
5246 ip6_dst_ops_template.kmem_cachep =
5247 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5248 SLAB_HWCACHE_ALIGN, NULL);
5249 if (!ip6_dst_ops_template.kmem_cachep)
5252 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5254 goto out_kmem_cache;
5256 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5258 goto out_dst_entries;
5260 ret = register_pernet_subsys(&ip6_route_net_ops);
5262 goto out_register_inetpeer;
/* Blackhole dsts share the same slab as regular rt6_info objects. */
5264 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5268 goto out_register_subsys;
5274 ret = fib6_rules_init();
5278 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5280 goto fib6_rules_init;
5282 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5283 inet6_rtm_newroute, NULL, 0);
5285 goto out_register_late_subsys;
5287 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5288 inet6_rtm_delroute, NULL, 0);
5290 goto out_register_late_subsys;
/* getroute takes no rtnl lock (DOIT_UNLOCKED) — it is read-only. */
5292 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5293 inet6_rtm_getroute, NULL,
5294 RTNL_FLAG_DOIT_UNLOCKED);
5296 goto out_register_late_subsys;
5298 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5300 goto out_register_late_subsys;
5302 for_each_possible_cpu(cpu) {
5303 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5305 INIT_LIST_HEAD(&ul->head);
5306 spin_lock_init(&ul->lock);
/* Error unwinding, reverse order of the registrations above. */
5312 out_register_late_subsys:
5313 rtnl_unregister_all(PF_INET6);
5314 unregister_pernet_subsys(&ip6_route_net_late_ops);
5316 fib6_rules_cleanup();
5321 out_register_subsys:
5322 unregister_pernet_subsys(&ip6_route_net_ops);
5323 out_register_inetpeer:
5324 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5326 dst_entries_destroy(&ip6_dst_blackhole_ops);
5328 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/* Module exit: undo ip6_route_init() in reverse order. */
5332 void ip6_route_cleanup(void)
5334 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5335 unregister_pernet_subsys(&ip6_route_net_late_ops);
5336 fib6_rules_cleanup();
5339 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5340 unregister_pernet_subsys(&ip6_route_net_ops);
5341 dst_entries_destroy(&ip6_dst_blackhole_ops);
5342 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);