2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void ip6_dst_destroy(struct dst_entry *);
86 static void ip6_dst_ifdown(struct dst_entry *,
87 struct net_device *dev, int how);
88 static int ip6_dst_gc(struct dst_ops *ops);
90 static int ip6_pkt_discard(struct sk_buff *skb);
91 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int ip6_pkt_prohibit(struct sk_buff *skb);
93 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void ip6_link_failure(struct sk_buff *skb);
95 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 struct sk_buff *skb, u32 mtu);
97 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct fib6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102 struct fib6_info *rt, struct dst_entry *dst,
103 struct in6_addr *dest, struct in6_addr *src,
104 int iif, int type, u32 portid, u32 seq,
106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107 struct in6_addr *daddr,
108 struct in6_addr *saddr);
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct fib6_info *rt6_add_route_info(struct net *net,
112 const struct in6_addr *prefix, int prefixlen,
113 const struct in6_addr *gwaddr,
114 struct net_device *dev,
116 static struct fib6_info *rt6_get_route_info(struct net *net,
117 const struct in6_addr *prefix, int prefixlen,
118 const struct in6_addr *gwaddr,
119 struct net_device *dev);
122 struct uncached_list {
124 struct list_head head;
127 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
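/* Per-cpu lists of rt6_info dsts that are not owned by the FIB tree
 * (RTF_CACHE clones and ICMP dsts).  Entries are added and removed under
 * the per-list lock; rt6_uncached_list_flush_dev() walks them so device
 * and idev references can be re-pointed at the loopback device when a
 * netdevice is unregistered.
 */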
129 void rt6_uncached_list_add(struct rt6_info *rt)
131 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
133 rt->rt6i_uncached_list = ul;
135 spin_lock_bh(&ul->lock);
136 list_add_tail(&rt->rt6i_uncached, &ul->head);
137 spin_unlock_bh(&ul->lock);
140 void rt6_uncached_list_del(struct rt6_info *rt)
142 if (!list_empty(&rt->rt6i_uncached)) {
143 struct uncached_list *ul = rt->rt6i_uncached_list;
144 struct net *net = dev_net(rt->dst.dev);
146 spin_lock_bh(&ul->lock);
147 list_del(&rt->rt6i_uncached);
148 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
149 spin_unlock_bh(&ul->lock);
153 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
155 struct net_device *loopback_dev = net->loopback_dev;
158 if (dev == loopback_dev)
161 for_each_possible_cpu(cpu) {
162 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 spin_lock_bh(&ul->lock);
166 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
167 struct inet6_dev *rt_idev = rt->rt6i_idev;
168 struct net_device *rt_dev = rt->dst.dev;
170 if (rt_idev->dev == dev) {
171 rt->rt6i_idev = in6_dev_get(loopback_dev);
172 in6_dev_put(rt_idev);
176 rt->dst.dev = loopback_dev;
177 dev_hold(rt->dst.dev);
181 spin_unlock_bh(&ul->lock);
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
189 if (!ipv6_addr_any(p))
190 return (const void *) p;
192 return &ipv6_hdr(skb)->daddr;
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197 struct net_device *dev,
203 daddr = choose_neigh_daddr(gw, skb, daddr);
204 n = __ipv6_neigh_lookup(dev, daddr);
207 return neigh_create(&nd_tbl, daddr, dev);
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
214 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
216 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
221 struct net_device *dev = dst->dev;
222 struct rt6_info *rt = (struct rt6_info *)dst;
224 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
227 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
229 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
231 __ipv6_confirm_neigh(dev, daddr);
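/* dst_ops for ordinary IPv6 dsts.  ip6_dst_blackhole_ops below mirrors it
 * but with no-op update_pmtu/redirect handlers; ip6_blackhole_route() uses
 * it to convert an existing dst into one that silently discards traffic.
 */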
234 static struct dst_ops ip6_dst_ops_template = {
238 .check = ip6_dst_check,
239 .default_advmss = ip6_default_advmss,
241 .cow_metrics = dst_cow_metrics_generic,
242 .destroy = ip6_dst_destroy,
243 .ifdown = ip6_dst_ifdown,
244 .negative_advice = ip6_negative_advice,
245 .link_failure = ip6_link_failure,
246 .update_pmtu = ip6_rt_update_pmtu,
247 .redirect = rt6_do_redirect,
248 .local_out = __ip6_local_out,
249 .neigh_lookup = ip6_dst_neigh_lookup,
250 .confirm_neigh = ip6_confirm_neigh,
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
255 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
257 return mtu ? : dst->dev->mtu;
260 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
261 struct sk_buff *skb, u32 mtu)
265 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
270 static struct dst_ops ip6_dst_blackhole_ops = {
272 .destroy = ip6_dst_destroy,
273 .check = ip6_dst_check,
274 .mtu = ip6_blackhole_mtu,
275 .default_advmss = ip6_default_advmss,
276 .update_pmtu = ip6_rt_blackhole_update_pmtu,
277 .redirect = ip6_rt_blackhole_redirect,
278 .cow_metrics = dst_cow_metrics_generic,
279 .neigh_lookup = ip6_dst_neigh_lookup,
282 static const u32 ip6_template_metrics[RTAX_MAX] = {
283 [RTAX_HOPLIMIT - 1] = 0,
286 static const struct fib6_info fib6_null_entry_template = {
287 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
288 .fib6_protocol = RTPROT_KERNEL,
289 .fib6_metric = ~(u32)0,
290 .fib6_ref = ATOMIC_INIT(1),
291 .fib6_type = RTN_UNREACHABLE,
292 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
295 static const struct rt6_info ip6_null_entry_template = {
297 .__refcnt = ATOMIC_INIT(1),
299 .obsolete = DST_OBSOLETE_FORCE_CHK,
300 .error = -ENETUNREACH,
301 .input = ip6_pkt_discard,
302 .output = ip6_pkt_discard_out,
304 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
309 static const struct rt6_info ip6_prohibit_entry_template = {
311 .__refcnt = ATOMIC_INIT(1),
313 .obsolete = DST_OBSOLETE_FORCE_CHK,
315 .input = ip6_pkt_prohibit,
316 .output = ip6_pkt_prohibit_out,
318 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
321 static const struct rt6_info ip6_blk_hole_entry_template = {
323 .__refcnt = ATOMIC_INIT(1),
325 .obsolete = DST_OBSOLETE_FORCE_CHK,
327 .input = dst_discard,
328 .output = dst_discard_out,
330 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
335 static void rt6_info_init(struct rt6_info *rt)
337 struct dst_entry *dst = &rt->dst;
339 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
340 INIT_LIST_HEAD(&rt->rt6i_uncached);
343 /* allocate dst with ip6_dst_ops */
344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
347 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
348 1, DST_OBSOLETE_FORCE_CHK, flags);
352 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
357 EXPORT_SYMBOL(ip6_dst_alloc);
359 static void ip6_dst_destroy(struct dst_entry *dst)
361 struct rt6_info *rt = (struct rt6_info *)dst;
362 struct fib6_info *from;
363 struct inet6_dev *idev;
365 dst_destroy_metrics_generic(dst);
366 rt6_uncached_list_del(rt);
368 idev = rt->rt6i_idev;
370 rt->rt6i_idev = NULL;
375 from = rcu_dereference(rt->from);
376 rcu_assign_pointer(rt->from, NULL);
377 fib6_info_release(from);
381 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
384 struct rt6_info *rt = (struct rt6_info *)dst;
385 struct inet6_dev *idev = rt->rt6i_idev;
386 struct net_device *loopback_dev =
387 dev_net(dev)->loopback_dev;
389 if (idev && idev->dev != loopback_dev) {
390 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
392 rt->rt6i_idev = loopback_idev;
398 static bool __rt6_check_expired(const struct rt6_info *rt)
400 if (rt->rt6i_flags & RTF_EXPIRES)
401 return time_after(jiffies, rt->dst.expires);
406 static bool rt6_check_expired(const struct rt6_info *rt)
408 struct fib6_info *from;
410 from = rcu_dereference(rt->from);
412 if (rt->rt6i_flags & RTF_EXPIRES) {
413 if (time_after(jiffies, rt->dst.expires))
416 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
417 fib6_check_expired(from);
422 static struct fib6_info *rt6_multipath_select(const struct net *net,
423 struct fib6_info *match,
424 struct flowi6 *fl6, int oif,
425 const struct sk_buff *skb,
428 struct fib6_info *sibling, *next_sibling;
430 /* We might have already computed the hash for ICMPv6 errors. In such
431 * case it will always be non-zero. Otherwise now is the time to do it.
434 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
436 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
439 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
443 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
444 if (fl6->mp_hash > nh_upper_bound)
446 if (rt6_score_route(sibling, oif, strict) < 0)
456 * Route lookup. rcu_read_lock() should be held.
459 static inline struct fib6_info *rt6_device_match(struct net *net,
460 struct fib6_info *rt,
461 const struct in6_addr *saddr,
465 struct fib6_info *sprt;
467 if (!oif && ipv6_addr_any(saddr) &&
468 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
471 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
472 const struct net_device *dev = sprt->fib6_nh.nh_dev;
474 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
478 if (dev->ifindex == oif)
481 if (ipv6_chk_addr(net, saddr, dev,
482 flags & RT6_LOOKUP_F_IFACE))
487 if (oif && flags & RT6_LOOKUP_F_IFACE)
488 return net->ipv6.fib6_null_entry;
490 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
493 #ifdef CONFIG_IPV6_ROUTER_PREF
494 struct __rt6_probe_work {
495 struct work_struct work;
496 struct in6_addr target;
497 struct net_device *dev;
500 static void rt6_probe_deferred(struct work_struct *w)
502 struct in6_addr mcaddr;
503 struct __rt6_probe_work *work =
504 container_of(w, struct __rt6_probe_work, work);
506 addrconf_addr_solict_mult(&work->target, &mcaddr);
507 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
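/* rt6_probe() verifies reachability of the gateway of a (default) route.
 * The neighbour solicitation itself is deferred to a workqueue via
 * rt6_probe_deferred(), since it cannot be sent from this context, which
 * runs under rcu_read_lock_bh() and the neighbour lock.
 */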
512 static void rt6_probe(struct fib6_info *rt)
514 struct __rt6_probe_work *work;
515 const struct in6_addr *nh_gw;
516 struct neighbour *neigh;
517 struct net_device *dev;
520 * Okay, this does not seem to be appropriate
521 * for now, however, we need to check if it
522 * is really so; aka Router Reachability Probing.
524 * Router Reachability Probe MUST be rate-limited
525 * to no more than one per minute.
527 if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
530 nh_gw = &rt->fib6_nh.nh_gw;
531 dev = rt->fib6_nh.nh_dev;
533 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
535 struct inet6_dev *idev;
537 if (neigh->nud_state & NUD_VALID)
540 idev = __in6_dev_get(dev);
542 write_lock(&neigh->lock);
543 if (!(neigh->nud_state & NUD_VALID) &&
545 neigh->updated + idev->cnf.rtr_probe_interval)) {
546 work = kmalloc(sizeof(*work), GFP_ATOMIC);
548 __neigh_set_probe_once(neigh);
550 write_unlock(&neigh->lock);
552 work = kmalloc(sizeof(*work), GFP_ATOMIC);
556 INIT_WORK(&work->work, rt6_probe_deferred);
557 work->target = *nh_gw;
560 schedule_work(&work->work);
564 rcu_read_unlock_bh();
567 static inline void rt6_probe(struct fib6_info *rt)
573 * Default Router Selection (RFC 2461 6.3.6, now RFC 4861 6.3.6)
575 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
577 const struct net_device *dev = rt->fib6_nh.nh_dev;
579 if (!oif || dev->ifindex == oif)
584 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
586 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
587 struct neighbour *neigh;
589 if (rt->fib6_flags & RTF_NONEXTHOP ||
590 !(rt->fib6_flags & RTF_GATEWAY))
591 return RT6_NUD_SUCCEED;
594 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
597 read_lock(&neigh->lock);
598 if (neigh->nud_state & NUD_VALID)
599 ret = RT6_NUD_SUCCEED;
600 #ifdef CONFIG_IPV6_ROUTER_PREF
601 else if (!(neigh->nud_state & NUD_FAILED))
602 ret = RT6_NUD_SUCCEED;
604 ret = RT6_NUD_FAIL_PROBE;
606 read_unlock(&neigh->lock);
608 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
609 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
611 rcu_read_unlock_bh();
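/* Score a candidate route for default-router selection: the device match
 * from rt6_check_dev() forms the base score, CONFIG_IPV6_ROUTER_PREF folds
 * the decoded RFC 4191 router preference in at bits 2+, and under
 * RT6_LOOKUP_F_REACHABLE a negative rt6_check_neigh() verdict is returned
 * as-is so the caller can skip the route or fall back to round-robin.
 */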
616 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
620 m = rt6_check_dev(rt, oif);
621 if (!m && (strict & RT6_LOOKUP_F_IFACE))
622 return RT6_NUD_FAIL_HARD;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
626 if (strict & RT6_LOOKUP_F_REACHABLE) {
627 int n = rt6_check_neigh(rt);
634 /* called with rcu_read_lock() held */
635 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
637 const struct net_device *dev = fib6_info_nh_dev(f6i);
641 const struct inet6_dev *idev = __in6_dev_get(dev);
643 rc = !!idev->cnf.ignore_routes_with_linkdown;
649 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
650 int *mpri, struct fib6_info *match,
654 bool match_do_rr = false;
656 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
659 if (fib6_ignore_linkdown(rt) &&
660 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
661 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
664 if (fib6_check_expired(rt))
667 m = rt6_score_route(rt, oif, strict);
668 if (m == RT6_NUD_FAIL_DO_RR) {
670 m = 0; /* lowest valid score */
671 } else if (m == RT6_NUD_FAIL_HARD) {
675 if (strict & RT6_LOOKUP_F_REACHABLE)
678 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
680 *do_rr = match_do_rr;
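/* find_rr_leaf() walks the routes sharing rr_head's metric (first from
 * rr_head to the end of the list, then from the leaf up to rr_head) and
 * returns the best one according to find_match(); routes with a different
 * metric are set aside in 'cont' and only considered afterwards.
 */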
688 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
689 struct fib6_info *leaf,
690 struct fib6_info *rr_head,
691 u32 metric, int oif, int strict,
694 struct fib6_info *rt, *match, *cont;
699 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
700 if (rt->fib6_metric != metric) {
705 match = find_match(rt, oif, strict, &mpri, match, do_rr);
708 for (rt = leaf; rt && rt != rr_head;
709 rt = rcu_dereference(rt->rt6_next)) {
710 if (rt->fib6_metric != metric) {
715 match = find_match(rt, oif, strict, &mpri, match, do_rr);
721 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
722 match = find_match(rt, oif, strict, &mpri, match, do_rr);
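/* rt6_select() implements round-robin default-router selection on top of
 * find_rr_leaf(): when the current head could not be confirmed reachable
 * (do_rr), fn->rr_ptr is advanced to the next route of the same metric so
 * that later lookups start from a different router.
 */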
727 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
730 struct fib6_info *leaf = rcu_dereference(fn->leaf);
731 struct fib6_info *match, *rt0;
735 if (!leaf || leaf == net->ipv6.fib6_null_entry)
736 return net->ipv6.fib6_null_entry;
738 rt0 = rcu_dereference(fn->rr_ptr);
742 /* Double check to make sure fn is not an intermediate node
743 * and fn->leaf does not point to its child's leaf
744 * (This might happen if all routes under fn are deleted from
745 * the tree and fib6_repair_tree() is called on the node.)
747 key_plen = rt0->fib6_dst.plen;
748 #ifdef CONFIG_IPV6_SUBTREES
749 if (rt0->fib6_src.plen)
750 key_plen = rt0->fib6_src.plen;
752 if (fn->fn_bit != key_plen)
753 return net->ipv6.fib6_null_entry;
755 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
759 struct fib6_info *next = rcu_dereference(rt0->rt6_next);
761 /* no entries matched; do round-robin */
762 if (!next || next->fib6_metric != rt0->fib6_metric)
766 spin_lock_bh(&leaf->fib6_table->tb6_lock);
767 /* make sure next is not being deleted from the tree */
769 rcu_assign_pointer(fn->rr_ptr, next);
770 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
774 return match ? match : net->ipv6.fib6_null_entry;
777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
779 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
782 #ifdef CONFIG_IPV6_ROUTE_INFO
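/* rt6_route_rcv() handles a Route Information option (RFC 4191) received
 * in a Router Advertisement: it validates the option length against the
 * prefix length, then adds, refreshes or removes the matching
 * RTF_ROUTEINFO route together with its preference and lifetime.
 */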
783 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
784 const struct in6_addr *gwaddr)
786 struct net *net = dev_net(dev);
787 struct route_info *rinfo = (struct route_info *) opt;
788 struct in6_addr prefix_buf, *prefix;
790 unsigned long lifetime;
791 struct fib6_info *rt;
793 if (len < sizeof(struct route_info)) {
797 /* Sanity check for prefix_len and length */
798 if (rinfo->length > 3) {
800 } else if (rinfo->prefix_len > 128) {
802 } else if (rinfo->prefix_len > 64) {
803 if (rinfo->length < 2) {
806 } else if (rinfo->prefix_len > 0) {
807 if (rinfo->length < 1) {
812 pref = rinfo->route_pref;
813 if (pref == ICMPV6_ROUTER_PREF_INVALID)
816 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
818 if (rinfo->length == 3)
819 prefix = (struct in6_addr *)rinfo->prefix;
821 /* this function is safe */
822 ipv6_addr_prefix(&prefix_buf,
823 (struct in6_addr *)rinfo->prefix,
825 prefix = &prefix_buf;
828 if (rinfo->prefix_len == 0)
829 rt = rt6_get_dflt_router(net, gwaddr, dev);
831 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
834 if (rt && !lifetime) {
840 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
843 rt->fib6_flags = RTF_ROUTEINFO |
844 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
847 if (!addrconf_finite_timeout(lifetime))
848 fib6_clean_expires(rt);
850 fib6_set_expires(rt, jiffies + HZ * lifetime);
852 fib6_info_release(rt);
859 * Misc support functions
862 /* called with rcu_read_lock() held */
863 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
865 struct net_device *dev = rt->fib6_nh.nh_dev;
867 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
868 /* for copies of local routes, dst->dev needs to be the master
869 * device if the device is enslaved, the device itself if it is
870 * an L3 master device, and the loopback device by default. */
872 if (netif_is_l3_slave(dev) &&
873 !rt6_need_strict(&rt->fib6_dst.addr))
874 dev = l3mdev_master_dev_rcu(dev);
875 else if (!netif_is_l3_master(dev))
876 dev = dev_net(dev)->loopback_dev;
877 /* the last case is netif_is_l3_master(dev) being true, in
878 * which case dev is returned unchanged
885 static const int fib6_prop[RTN_MAX + 1] = {
892 [RTN_BLACKHOLE] = -EINVAL,
893 [RTN_UNREACHABLE] = -EHOSTUNREACH,
894 [RTN_PROHIBIT] = -EACCES,
895 [RTN_THROW] = -EAGAIN,
897 [RTN_XRESOLVE] = -EINVAL,
900 static int ip6_rt_type_to_error(u8 fib6_type)
902 return fib6_prop[fib6_type];
905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
907 unsigned short flags = 0;
910 flags |= DST_NOCOUNT;
911 if (rt->dst_nopolicy)
912 flags |= DST_NOPOLICY;
919 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
921 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
923 switch (ort->fib6_type) {
925 rt->dst.output = dst_discard_out;
926 rt->dst.input = dst_discard;
929 rt->dst.output = ip6_pkt_prohibit_out;
930 rt->dst.input = ip6_pkt_prohibit;
933 case RTN_UNREACHABLE:
935 rt->dst.output = ip6_pkt_discard_out;
936 rt->dst.input = ip6_pkt_discard;
941 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
943 rt->dst.flags |= fib6_info_dst_flags(ort);
945 if (ort->fib6_flags & RTF_REJECT) {
946 ip6_rt_init_dst_reject(rt, ort);
951 rt->dst.output = ip6_output;
953 if (ort->fib6_type == RTN_LOCAL) {
954 rt->dst.input = ip6_input;
955 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
956 rt->dst.input = ip6_mc_input;
958 rt->dst.input = ip6_forward;
961 if (ort->fib6_nh.nh_lwtstate) {
962 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
963 lwtunnel_set_redirect(&rt->dst);
966 rt->dst.lastuse = jiffies;
969 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
971 rt->rt6i_flags &= ~RTF_EXPIRES;
972 fib6_info_hold(from);
973 rcu_assign_pointer(rt->from, from);
974 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
975 if (from->fib6_metrics != &dst_default_metrics) {
976 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
977 refcount_inc(&from->fib6_metrics->refcnt);
981 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
983 struct net_device *dev = fib6_info_nh_dev(ort);
985 ip6_rt_init_dst(rt, ort);
987 rt->rt6i_dst = ort->fib6_dst;
988 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
989 rt->rt6i_gateway = ort->fib6_nh.nh_gw;
990 rt->rt6i_flags = ort->fib6_flags;
991 rt6_set_from(rt, ort);
992 #ifdef CONFIG_IPV6_SUBTREES
993 rt->rt6i_src = ort->fib6_src;
995 rt->rt6i_prefsrc = ort->fib6_prefsrc;
996 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
999 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1000 struct in6_addr *saddr)
1002 struct fib6_node *pn, *sn;
1004 if (fn->fn_flags & RTN_TL_ROOT)
1006 pn = rcu_dereference(fn->parent);
1007 sn = FIB6_SUBTREE(pn);
1009 fn = fib6_lookup(sn, NULL, saddr);
1012 if (fn->fn_flags & RTN_RTINFO)
1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1020 struct rt6_info *rt = *prt;
1022 if (dst_hold_safe(&rt->dst))
1024 if (null_fallback) {
1025 rt = net->ipv6.ip6_null_entry;
1034 /* called with rcu_read_lock() held */
1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1037 unsigned short flags = fib6_info_dst_flags(rt);
1038 struct net_device *dev = rt->fib6_nh.nh_dev;
1039 struct rt6_info *nrt;
1041 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1043 ip6_rt_copy_init(nrt, rt);
1048 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1049 struct fib6_table *table,
1051 const struct sk_buff *skb,
1054 struct fib6_info *f6i;
1055 struct fib6_node *fn;
1056 struct rt6_info *rt;
1058 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1059 flags &= ~RT6_LOOKUP_F_IFACE;
1062 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1064 f6i = rcu_dereference(fn->leaf);
1066 f6i = net->ipv6.fib6_null_entry;
1068 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1069 fl6->flowi6_oif, flags);
1070 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1071 f6i = rt6_multipath_select(net, f6i, fl6,
1072 fl6->flowi6_oif, skb, flags);
1074 if (f6i == net->ipv6.fib6_null_entry) {
1075 fn = fib6_backtrack(fn, &fl6->saddr);
1080 /* Search through exception table */
1081 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1083 if (ip6_hold_safe(net, &rt, true))
1084 dst_use_noref(&rt->dst, jiffies);
1085 } else if (f6i == net->ipv6.fib6_null_entry) {
1086 rt = net->ipv6.ip6_null_entry;
1089 rt = ip6_create_rt_rcu(f6i);
1091 rt = net->ipv6.ip6_null_entry;
1098 trace_fib6_table_lookup(net, rt, table, fl6);
1103 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1104 const struct sk_buff *skb, int flags)
1106 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1108 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1110 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1111 const struct in6_addr *saddr, int oif,
1112 const struct sk_buff *skb, int strict)
1114 struct flowi6 fl6 = {
1118 struct dst_entry *dst;
1119 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1122 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1123 flags |= RT6_LOOKUP_F_HAS_SADDR;
1126 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1127 if (dst->error == 0)
1128 return (struct rt6_info *) dst;
1134 EXPORT_SYMBOL(rt6_lookup);
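/* Typical rt6_lookup() caller pattern (illustrative sketch only, not code
 * from this file):
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 *	if (rt) {
 *		... use rt->dst ...
 *		ip6_rt_put(rt);
 *	}
 *
 * rt6_lookup() returns NULL on error, otherwise a referenced rt6_info that
 * the caller must release with ip6_rt_put().
 */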
1136 /* ip6_ins_rt is called with table->tb6_lock NOT held.
1137 * It takes a new route entry; if the addition fails for any reason,
1138 * the route is released.
1139 * Caller must hold dst before calling it.
1142 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1143 struct netlink_ext_ack *extack)
1146 struct fib6_table *table;
1148 table = rt->fib6_table;
1149 spin_lock_bh(&table->tb6_lock);
1150 err = fib6_add(&table->tb6_root, rt, info, extack);
1151 spin_unlock_bh(&table->tb6_lock);
1156 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1158 struct nl_info info = { .nl_net = net, };
1160 return __ip6_ins_rt(rt, &info, NULL);
1163 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1164 const struct in6_addr *daddr,
1165 const struct in6_addr *saddr)
1167 struct net_device *dev;
1168 struct rt6_info *rt;
1174 dev = ip6_rt_get_dev_rcu(ort);
1175 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1179 ip6_rt_copy_init(rt, ort);
1180 rt->rt6i_flags |= RTF_CACHE;
1181 rt->dst.flags |= DST_HOST;
1182 rt->rt6i_dst.addr = *daddr;
1183 rt->rt6i_dst.plen = 128;
1185 if (!rt6_is_gw_or_nonexthop(ort)) {
1186 if (ort->fib6_dst.plen != 128 &&
1187 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1188 rt->rt6i_flags |= RTF_ANYCAST;
1189 #ifdef CONFIG_IPV6_SUBTREES
1190 if (rt->rt6i_src.plen && saddr) {
1191 rt->rt6i_src.addr = *saddr;
1192 rt->rt6i_src.plen = 128;
1200 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1202 unsigned short flags = fib6_info_dst_flags(rt);
1203 struct net_device *dev;
1204 struct rt6_info *pcpu_rt;
1207 dev = ip6_rt_get_dev_rcu(rt);
1208 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1212 ip6_rt_copy_init(pcpu_rt, rt);
1213 pcpu_rt->rt6i_flags |= RTF_PCPU;
1217 /* It should be called with rcu_read_lock() acquired */
1218 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1220 struct rt6_info *pcpu_rt, **p;
1222 p = this_cpu_ptr(rt->rt6i_pcpu);
1226 ip6_hold_safe(NULL, &pcpu_rt, false);
1231 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1232 struct fib6_info *rt)
1234 struct rt6_info *pcpu_rt, *prev, **p;
1236 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1238 dst_hold(&net->ipv6.ip6_null_entry->dst);
1239 return net->ipv6.ip6_null_entry;
1242 dst_hold(&pcpu_rt->dst);
1243 p = this_cpu_ptr(rt->rt6i_pcpu);
1244 prev = cmpxchg(p, NULL, pcpu_rt);
1250 /* exception hash table implementation
1252 static DEFINE_SPINLOCK(rt6_exception_lock);
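/* A fib6_info's exception table stores the RTF_CACHE clones created for
 * PMTU updates and redirects, hashed by destination (and, with
 * CONFIG_IPV6_SUBTREES, source) address.  Writers serialize on
 * rt6_exception_lock; readers walk the buckets under rcu_read_lock().
 */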
1254 /* Remove rt6_ex from hash table and free the memory
1255 * Caller must hold rt6_exception_lock
1257 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1258 struct rt6_exception *rt6_ex)
1262 if (!bucket || !rt6_ex)
1265 net = dev_net(rt6_ex->rt6i->dst.dev);
1266 hlist_del_rcu(&rt6_ex->hlist);
1267 dst_release(&rt6_ex->rt6i->dst);
1268 kfree_rcu(rt6_ex, rcu);
1269 WARN_ON_ONCE(!bucket->depth);
1271 net->ipv6.rt6_stats->fib_rt_cache--;
1274 /* Remove oldest rt6_ex in bucket and free the memory
1275 * Caller must hold rt6_exception_lock
1277 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1279 struct rt6_exception *rt6_ex, *oldest = NULL;
1284 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1285 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1288 rt6_remove_exception(bucket, oldest);
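/* rt6_exception_hash() folds a jhash of the destination (and, for subtree
 * routes, source) address with a boot-time random seed down to
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, yielding the bucket index.
 */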
1291 static u32 rt6_exception_hash(const struct in6_addr *dst,
1292 const struct in6_addr *src)
1294 static u32 seed __read_mostly;
1297 net_get_random_once(&seed, sizeof(seed));
1298 val = jhash(dst, sizeof(*dst), seed);
1300 #ifdef CONFIG_IPV6_SUBTREES
1302 val = jhash(src, sizeof(*src), val);
1304 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1307 /* Helper function to find the cached rt in the hash table
1308 * and update bucket pointer to point to the bucket for this
1309 * (daddr, saddr) pair
1310 * Caller must hold rt6_exception_lock
1312 static struct rt6_exception *
1313 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1314 const struct in6_addr *daddr,
1315 const struct in6_addr *saddr)
1317 struct rt6_exception *rt6_ex;
1320 if (!(*bucket) || !daddr)
1323 hval = rt6_exception_hash(daddr, saddr);
1326 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1327 struct rt6_info *rt6 = rt6_ex->rt6i;
1328 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1330 #ifdef CONFIG_IPV6_SUBTREES
1331 if (matched && saddr)
1332 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1340 /* Helper function to find the cached rt in the hash table
1341 * and update bucket pointer to point to the bucket for this
1342 * (daddr, saddr) pair
1343 * Caller must hold rcu_read_lock()
1345 static struct rt6_exception *
1346 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1347 const struct in6_addr *daddr,
1348 const struct in6_addr *saddr)
1350 struct rt6_exception *rt6_ex;
1353 WARN_ON_ONCE(!rcu_read_lock_held());
1355 if (!(*bucket) || !daddr)
1358 hval = rt6_exception_hash(daddr, saddr);
1361 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1362 struct rt6_info *rt6 = rt6_ex->rt6i;
1363 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1365 #ifdef CONFIG_IPV6_SUBTREES
1366 if (matched && saddr)
1367 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1375 static unsigned int fib6_mtu(const struct fib6_info *rt)
1379 if (rt->fib6_pmtu) {
1380 mtu = rt->fib6_pmtu;
1382 struct net_device *dev = fib6_info_nh_dev(rt);
1383 struct inet6_dev *idev;
1386 idev = __in6_dev_get(dev);
1387 mtu = idev->cnf.mtu6;
1391 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1393 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
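/* rt6_insert_exception() hangs an RTF_CACHE clone (nrt) off its parent
 * fib6_info (ort): the bucket array is allocated lazily, any existing
 * exception for the same key is replaced, buckets deeper than
 * FIB6_MAX_DEPTH evict their oldest entry, and the node's sernum is bumped
 * so already-cached dsts get revalidated.
 */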
1396 static int rt6_insert_exception(struct rt6_info *nrt,
1397 struct fib6_info *ort)
1399 struct net *net = dev_net(nrt->dst.dev);
1400 struct rt6_exception_bucket *bucket;
1401 struct in6_addr *src_key = NULL;
1402 struct rt6_exception *rt6_ex;
1405 spin_lock_bh(&rt6_exception_lock);
1407 if (ort->exception_bucket_flushed) {
1412 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1413 lockdep_is_held(&rt6_exception_lock));
1415 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1421 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1424 #ifdef CONFIG_IPV6_SUBTREES
1425 /* rt6i_src.plen != 0 indicates ort is in subtree
1426 * and exception table is indexed by a hash of
1427 * both rt6i_dst and rt6i_src.
1428 * Otherwise, the exception table is indexed by
1429 * a hash of only rt6i_dst.
1431 if (ort->fib6_src.plen)
1432 src_key = &nrt->rt6i_src.addr;
1435 /* Update rt6i_prefsrc as it could be changed
1436 * in rt6_remove_prefsrc()
1438 nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1439 /* rt6_mtu_change() might lower mtu on ort.
1440 * Only insert this exception route if its mtu
1441 * is less than ort's mtu value.
1443 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1448 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1451 rt6_remove_exception(bucket, rt6_ex);
1453 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1459 rt6_ex->stamp = jiffies;
1460 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1462 net->ipv6.rt6_stats->fib_rt_cache++;
1464 if (bucket->depth > FIB6_MAX_DEPTH)
1465 rt6_exception_remove_oldest(bucket);
1468 spin_unlock_bh(&rt6_exception_lock);
1470 /* Update fn->fn_sernum to invalidate all cached dst */
1472 spin_lock_bh(&ort->fib6_table->tb6_lock);
1473 fib6_update_sernum(net, ort);
1474 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1475 fib6_force_start_gc(net);
1481 void rt6_flush_exceptions(struct fib6_info *rt)
1483 struct rt6_exception_bucket *bucket;
1484 struct rt6_exception *rt6_ex;
1485 struct hlist_node *tmp;
1488 spin_lock_bh(&rt6_exception_lock);
1489 /* Prevent rt6_insert_exception() from recreating the bucket list */
1490 rt->exception_bucket_flushed = 1;
1492 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1493 lockdep_is_held(&rt6_exception_lock));
1497 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1498 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1499 rt6_remove_exception(bucket, rt6_ex);
1500 WARN_ON_ONCE(bucket->depth);
1505 spin_unlock_bh(&rt6_exception_lock);
1508 /* Find the cached rt in the hash table of the passed-in rt
1509 * Caller has to hold rcu_read_lock()
1511 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1512 struct in6_addr *daddr,
1513 struct in6_addr *saddr)
1515 struct rt6_exception_bucket *bucket;
1516 struct in6_addr *src_key = NULL;
1517 struct rt6_exception *rt6_ex;
1518 struct rt6_info *res = NULL;
1520 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1522 #ifdef CONFIG_IPV6_SUBTREES
1523 /* rt6i_src.plen != 0 indicates rt is in subtree
1524 * and exception table is indexed by a hash of
1525 * both rt6i_dst and rt6i_src.
1526 * Otherwise, the exception table is indexed by
1527 * a hash of only rt6i_dst.
1529 if (rt->fib6_src.plen)
1532 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1534 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1540 /* Remove the passed in cached rt from the hash table that contains it */
1541 static int rt6_remove_exception_rt(struct rt6_info *rt)
1543 struct rt6_exception_bucket *bucket;
1544 struct fib6_info *from = rt->from;
1545 struct in6_addr *src_key = NULL;
1546 struct rt6_exception *rt6_ex;
1550 !(rt->rt6i_flags & RTF_CACHE))
1553 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1556 spin_lock_bh(&rt6_exception_lock);
1557 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1558 lockdep_is_held(&rt6_exception_lock));
1559 #ifdef CONFIG_IPV6_SUBTREES
1560 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1561 * and exception table is indexed by a hash of
1562 * both rt6i_dst and rt6i_src.
1563 * Otherwise, the exception table is indexed by
1564 * a hash of only rt6i_dst.
1566 if (from->fib6_src.plen)
1567 src_key = &rt->rt6i_src.addr;
1569 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1573 rt6_remove_exception(bucket, rt6_ex);
1579 spin_unlock_bh(&rt6_exception_lock);
1583 /* Find the rt6_ex which contains the passed-in cached rt and refresh its stamp */
1586 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1588 struct rt6_exception_bucket *bucket;
1589 struct fib6_info *from = rt->from;
1590 struct in6_addr *src_key = NULL;
1591 struct rt6_exception *rt6_ex;
1594 !(rt->rt6i_flags & RTF_CACHE))
1598 bucket = rcu_dereference(from->rt6i_exception_bucket);
1600 #ifdef CONFIG_IPV6_SUBTREES
1601 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1602 * and exception table is indexed by a hash of
1603 * both rt6i_dst and rt6i_src.
1604 * Otherwise, the exception table is indexed by
1605 * a hash of only rt6i_dst.
1607 if (from->fib6_src.plen)
1608 src_key = &rt->rt6i_src.addr;
1610 rt6_ex = __rt6_find_exception_rcu(&bucket,
1614 rt6_ex->stamp = jiffies;
1619 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1621 struct rt6_exception_bucket *bucket;
1622 struct rt6_exception *rt6_ex;
1625 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1626 lockdep_is_held(&rt6_exception_lock));
1629 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1630 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1631 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1638 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1639 struct rt6_info *rt, int mtu)
1641 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1642 * lowest MTU in the path: always allow updating the route PMTU to
1643 * reflect PMTU decreases.
1645 * If the new MTU is higher, and the route PMTU is equal to the local
1646 * MTU, this means the old MTU is the lowest in the path, so allow
1647 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1651 if (dst_mtu(&rt->dst) >= mtu)
1654 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1660 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1661 struct fib6_info *rt, int mtu)
1663 struct rt6_exception_bucket *bucket;
1664 struct rt6_exception *rt6_ex;
1667 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1668 lockdep_is_held(&rt6_exception_lock));
1673 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1674 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1675 struct rt6_info *entry = rt6_ex->rt6i;
1677 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1678 * route), the metrics of its rt->from have already
1681 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1682 rt6_mtu_change_route_allowed(idev, entry, mtu))
1683 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1689 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1691 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1692 struct in6_addr *gateway)
1694 struct rt6_exception_bucket *bucket;
1695 struct rt6_exception *rt6_ex;
1696 struct hlist_node *tmp;
1699 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1702 spin_lock_bh(&rt6_exception_lock);
1703 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1704 lockdep_is_held(&rt6_exception_lock));
1707 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1708 hlist_for_each_entry_safe(rt6_ex, tmp,
1709 &bucket->chain, hlist) {
1710 struct rt6_info *entry = rt6_ex->rt6i;
1712 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1713 RTF_CACHE_GATEWAY &&
1714 ipv6_addr_equal(gateway,
1715 &entry->rt6i_gateway)) {
1716 rt6_remove_exception(bucket, rt6_ex);
1723 spin_unlock_bh(&rt6_exception_lock);
1726 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1727 struct rt6_exception *rt6_ex,
1728 struct fib6_gc_args *gc_args,
1731 struct rt6_info *rt = rt6_ex->rt6i;
1733 /* we are pruning and obsoleting aged-out and non-gateway exceptions
1734 * even if others have still references to them, so that on next
1735 * dst_check() such references can be dropped.
1736 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1737 * expired, independently from their aging, as per RFC 8201 section 4
1739 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1740 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1741 RT6_TRACE("aging clone %p\n", rt);
1742 rt6_remove_exception(bucket, rt6_ex);
1745 } else if (time_after(jiffies, rt->dst.expires)) {
1746 RT6_TRACE("purging expired route %p\n", rt);
1747 rt6_remove_exception(bucket, rt6_ex);
1751 if (rt->rt6i_flags & RTF_GATEWAY) {
1752 struct neighbour *neigh;
1753 __u8 neigh_flags = 0;
1755 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1757 neigh_flags = neigh->flags;
1759 if (!(neigh_flags & NTF_ROUTER)) {
1760 RT6_TRACE("purging route %p via non-router but gateway\n",
1762 rt6_remove_exception(bucket, rt6_ex);
1770 void rt6_age_exceptions(struct fib6_info *rt,
1771 struct fib6_gc_args *gc_args,
1774 struct rt6_exception_bucket *bucket;
1775 struct rt6_exception *rt6_ex;
1776 struct hlist_node *tmp;
1779 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1783 spin_lock(&rt6_exception_lock);
1784 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1785 lockdep_is_held(&rt6_exception_lock));
1788 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1789 hlist_for_each_entry_safe(rt6_ex, tmp,
1790 &bucket->chain, hlist) {
1791 rt6_age_examine_exception(bucket, rt6_ex,
1797 spin_unlock(&rt6_exception_lock);
1798 rcu_read_unlock_bh();
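/* ip6_pol_route() is the main policy-routing lookup.  It selects a
 * fib6_info via rt6_select()/rt6_multipath_select() under RCU and then
 * returns one of: a cached exception route, an uncached RTF_CACHE clone
 * (for FLOWI_FLAG_KNOWN_NH on a route without a gateway), or the per-cpu
 * dst copy of the matched route.
 */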
1801 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1802 int oif, struct flowi6 *fl6,
1803 const struct sk_buff *skb, int flags)
1805 struct fib6_node *fn, *saved_fn;
1806 struct fib6_info *f6i;
1807 struct rt6_info *rt;
1810 strict |= flags & RT6_LOOKUP_F_IFACE;
1811 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1812 if (net->ipv6.devconf_all->forwarding == 0)
1813 strict |= RT6_LOOKUP_F_REACHABLE;
1817 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1820 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1824 f6i = rt6_select(net, fn, oif, strict);
1825 if (f6i->fib6_nsiblings)
1826 f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
1827 if (f6i == net->ipv6.fib6_null_entry) {
1828 fn = fib6_backtrack(fn, &fl6->saddr);
1830 goto redo_rt6_select;
1831 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1832 /* also consider unreachable route */
1833 strict &= ~RT6_LOOKUP_F_REACHABLE;
1835 goto redo_rt6_select;
1839 if (f6i == net->ipv6.fib6_null_entry) {
1840 rt = net->ipv6.ip6_null_entry;
1843 trace_fib6_table_lookup(net, rt, table, fl6);
1847 /* Search through exception table */
1848 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1850 if (ip6_hold_safe(net, &rt, true))
1851 dst_use_noref(&rt->dst, jiffies);
1854 trace_fib6_table_lookup(net, rt, table, fl6);
1856 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1857 !(f6i->fib6_flags & RTF_GATEWAY))) {
1858 /* Create a RTF_CACHE clone which will not be
1859 * owned by the fib6 tree. It is for the special case where
1860 * the daddr in the skb during the neighbor look-up is different
1861 * from the fl6->daddr used to look-up route here.
1863 struct rt6_info *uncached_rt;
1865 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1870 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1871 * No need for another dst_hold()
1873 rt6_uncached_list_add(uncached_rt);
1874 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1876 uncached_rt = net->ipv6.ip6_null_entry;
1877 dst_hold(&uncached_rt->dst);
1880 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1884 /* Get a percpu copy */
1886 struct rt6_info *pcpu_rt;
1889 pcpu_rt = rt6_get_pcpu_route(f6i);
1892 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1896 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1900 EXPORT_SYMBOL_GPL(ip6_pol_route);
1902 static struct rt6_info *ip6_pol_route_input(struct net *net,
1903 struct fib6_table *table,
1905 const struct sk_buff *skb,
1908 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1911 struct dst_entry *ip6_route_input_lookup(struct net *net,
1912 struct net_device *dev,
1914 const struct sk_buff *skb,
1917 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1918 flags |= RT6_LOOKUP_F_IFACE;
1920 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1922 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
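/* For ICMPv6 errors, ip6_multipath_l3_keys() hashes the addresses of the
 * offending (inner) packet rather than those of the error itself, so the
 * error message follows the same multipath route as the original flow.
 */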
1924 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1925 struct flow_keys *keys,
1926 struct flow_keys *flkeys)
1928 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1929 const struct ipv6hdr *key_iph = outer_iph;
1930 struct flow_keys *_flkeys = flkeys;
1931 const struct ipv6hdr *inner_iph;
1932 const struct icmp6hdr *icmph;
1933 struct ipv6hdr _inner_iph;
1935 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1938 icmph = icmp6_hdr(skb);
1939 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1940 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1941 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1942 icmph->icmp6_type != ICMPV6_PARAMPROB)
1945 inner_iph = skb_header_pointer(skb,
1946 skb_transport_offset(skb) + sizeof(*icmph),
1947 sizeof(_inner_iph), &_inner_iph);
1951 key_iph = inner_iph;
1955 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1956 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1957 keys->tags.flow_label = _flkeys->tags.flow_label;
1958 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1960 keys->addrs.v6addrs.src = key_iph->saddr;
1961 keys->addrs.v6addrs.dst = key_iph->daddr;
1962 keys->tags.flow_label = ip6_flowinfo(key_iph);
1963 keys->basic.ip_proto = key_iph->nexthdr;
1967 /* if skb is set it will be used and fl6 can be NULL */
1968 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1969 const struct sk_buff *skb, struct flow_keys *flkeys)
1971 struct flow_keys hash_keys;
1974 switch (ip6_multipath_hash_policy(net)) {
1976 memset(&hash_keys, 0, sizeof(hash_keys));
1977 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1979 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1981 hash_keys.addrs.v6addrs.src = fl6->saddr;
1982 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1983 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1984 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1989 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1990 struct flow_keys keys;
1992 /* short-circuit if we already have L4 hash present */
1994 return skb_get_hash_raw(skb) >> 1;
1996 memset(&hash_keys, 0, sizeof(hash_keys));
1999 skb_flow_dissect_flow_keys(skb, &keys, flag);
2002 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2003 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2004 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2005 hash_keys.ports.src = flkeys->ports.src;
2006 hash_keys.ports.dst = flkeys->ports.dst;
2007 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2009 memset(&hash_keys, 0, sizeof(hash_keys));
2010 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2011 hash_keys.addrs.v6addrs.src = fl6->saddr;
2012 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2013 hash_keys.ports.src = fl6->fl6_sport;
2014 hash_keys.ports.dst = fl6->fl6_dport;
2015 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2019 mhash = flow_hash_from_keys(&hash_keys);
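/* rt6_multipath_hash() supports two policies: 0 hashes the L3 addresses
 * and flow label (for ICMPv6 errors, those of the inner packet), 1 hashes
 * the 5-tuple.  The resulting value is compared against each sibling's
 * nh_upper_bound in rt6_multipath_select() to pick a nexthop.
 */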
2024 void ip6_route_input(struct sk_buff *skb)
2026 const struct ipv6hdr *iph = ipv6_hdr(skb);
2027 struct net *net = dev_net(skb->dev);
2028 int flags = RT6_LOOKUP_F_HAS_SADDR;
2029 struct ip_tunnel_info *tun_info;
2030 struct flowi6 fl6 = {
2031 .flowi6_iif = skb->dev->ifindex,
2032 .daddr = iph->daddr,
2033 .saddr = iph->saddr,
2034 .flowlabel = ip6_flowinfo(iph),
2035 .flowi6_mark = skb->mark,
2036 .flowi6_proto = iph->nexthdr,
2038 struct flow_keys *flkeys = NULL, _flkeys;
2040 tun_info = skb_tunnel_info(skb);
2041 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2042 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2044 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2047 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2048 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2051 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2054 static struct rt6_info *ip6_pol_route_output(struct net *net,
2055 struct fib6_table *table,
2057 const struct sk_buff *skb,
2060 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2063 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2064 struct flowi6 *fl6, int flags)
2068 if (rt6_need_strict(&fl6->daddr)) {
2069 struct dst_entry *dst;
2071 dst = l3mdev_link_scope_lookup(net, fl6);
2076 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2078 any_src = ipv6_addr_any(&fl6->saddr);
2079 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2080 (fl6->flowi6_oif && any_src))
2081 flags |= RT6_LOOKUP_F_IFACE;
2084 flags |= RT6_LOOKUP_F_HAS_SADDR;
2086 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2088 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2090 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2092 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2094 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2095 struct net_device *loopback_dev = net->loopback_dev;
2096 struct dst_entry *new = NULL;
2098 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2099 DST_OBSOLETE_DEAD, 0);
2102 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2106 new->input = dst_discard;
2107 new->output = dst_discard_out;
2109 dst_copy_metrics(new, &ort->dst);
2111 rt->rt6i_idev = in6_dev_get(loopback_dev);
2112 rt->rt6i_gateway = ort->rt6i_gateway;
2113 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2115 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2116 #ifdef CONFIG_IPV6_SUBTREES
2117 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2121 dst_release(dst_orig);
2122 return new ? new : ERR_PTR(-ENOMEM);
2126 * Destination cache support functions
2129 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2133 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2136 if (fib6_check_expired(f6i))
2142 static struct dst_entry *rt6_check(struct rt6_info *rt,
2143 struct fib6_info *from,
2148 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2149 rt_cookie != cookie)
2152 if (rt6_check_expired(rt))
2158 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2159 struct fib6_info *from,
2162 if (!__rt6_check_expired(rt) &&
2163 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2164 fib6_check(from, cookie))
2170 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2172 struct dst_entry *dst_ret;
2173 struct fib6_info *from;
2174 struct rt6_info *rt;
2176 rt = container_of(dst, struct rt6_info, dst);
2180 /* All IPV6 dsts are created with ->obsolete set to the value
2181 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2182 * into this function always.
2185 from = rcu_dereference(rt->from);
2187 if (from && (rt->rt6i_flags & RTF_PCPU ||
2188 unlikely(!list_empty(&rt->rt6i_uncached))))
2189 dst_ret = rt6_dst_from_check(rt, from, cookie);
2191 dst_ret = rt6_check(rt, from, cookie);
2198 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2200 struct rt6_info *rt = (struct rt6_info *) dst;
2203 if (rt->rt6i_flags & RTF_CACHE) {
2205 if (rt6_check_expired(rt)) {
2206 rt6_remove_exception_rt(rt);
2218 static void ip6_link_failure(struct sk_buff *skb)
2220 struct rt6_info *rt;
2222 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2224 rt = (struct rt6_info *) skb_dst(skb);
2226 if (rt->rt6i_flags & RTF_CACHE) {
2227 if (dst_hold_safe(&rt->dst))
2228 rt6_remove_exception_rt(rt);
2230 struct fib6_info *from;
2231 struct fib6_node *fn;
2234 from = rcu_dereference(rt->from);
2236 fn = rcu_dereference(from->fib6_node);
2237 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2245 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2247 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2248 struct fib6_info *from;
2251 from = rcu_dereference(rt0->from);
2253 rt0->dst.expires = from->expires;
2257 dst_set_expires(&rt0->dst, timeout);
2258 rt0->rt6i_flags |= RTF_EXPIRES;
2261 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2263 struct net *net = dev_net(rt->dst.dev);
2265 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2266 rt->rt6i_flags |= RTF_MODIFIED;
2267 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2270 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2275 from_set = !!rcu_dereference(rt->from);
2278 return !(rt->rt6i_flags & RTF_CACHE) &&
2279 (rt->rt6i_flags & RTF_PCPU || from_set);
2282 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2283 const struct ipv6hdr *iph, u32 mtu)
2285 const struct in6_addr *daddr, *saddr;
2286 struct rt6_info *rt6 = (struct rt6_info *)dst;
2288 if (rt6->rt6i_flags & RTF_LOCAL)
2291 if (dst_metric_locked(dst, RTAX_MTU))
2295 daddr = &iph->daddr;
2296 saddr = &iph->saddr;
2298 daddr = &sk->sk_v6_daddr;
2299 saddr = &inet6_sk(sk)->saddr;
2304 dst_confirm_neigh(dst, daddr);
2305 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2306 if (mtu >= dst_mtu(dst))
2309 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2310 rt6_do_update_pmtu(rt6, mtu);
2311 /* update rt6_ex->stamp for cache */
2312 if (rt6->rt6i_flags & RTF_CACHE)
2313 rt6_update_exception_stamp_rt(rt6);
2315 struct fib6_info *from;
2316 struct rt6_info *nrt6;
2319 from = rcu_dereference(rt6->from);
2320 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2322 rt6_do_update_pmtu(nrt6, mtu);
2323 if (rt6_insert_exception(nrt6, from))
2324 dst_release_immediate(&nrt6->dst);
2330 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2331 struct sk_buff *skb, u32 mtu)
2333 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2336 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2337 int oif, u32 mark, kuid_t uid)
2339 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2340 struct dst_entry *dst;
2343 memset(&fl6, 0, sizeof(fl6));
2344 fl6.flowi6_oif = oif;
2345 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2346 fl6.daddr = iph->daddr;
2347 fl6.saddr = iph->saddr;
2348 fl6.flowlabel = ip6_flowinfo(iph);
2349 fl6.flowi6_uid = uid;
2351 dst = ip6_route_output(net, NULL, &fl6);
2353 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2356 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2358 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2360 struct dst_entry *dst;
2362 ip6_update_pmtu(skb, sock_net(sk), mtu,
2363 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2365 dst = __sk_dst_get(sk);
2366 if (!dst || !dst->obsolete ||
2367 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2371 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2372 ip6_datagram_dst_update(sk, false);
2375 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2377 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2378 const struct flowi6 *fl6)
2380 #ifdef CONFIG_IPV6_SUBTREES
2381 struct ipv6_pinfo *np = inet6_sk(sk);
2384 ip6_dst_store(sk, dst,
2385 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2386 &sk->sk_v6_daddr : NULL,
2387 #ifdef CONFIG_IPV6_SUBTREES
2388 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2394 /* Handle redirects */
2395 struct ip6rd_flowi {
2397 struct in6_addr gateway;
2400 static struct rt6_info *__ip6_route_redirect(struct net *net,
2401 struct fib6_table *table,
2403 const struct sk_buff *skb,
2406 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2407 struct rt6_info *ret = NULL, *rt_cache;
2408 struct fib6_info *rt;
2409 struct fib6_node *fn;
2411 /* Get the "current" route for this destination and
2412 * check if the redirect has come from appropriate router.
2414 * RFC 4861 specifies that redirects should only be
2415 * accepted if they come from the nexthop to the target.
2416 * Due to the way the routes are chosen, this notion
2417 * is a bit fuzzy and one might need to check all possible routers. */
2422 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2424 for_each_fib6_node_rt_rcu(fn) {
2425 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2427 if (fib6_check_expired(rt))
2429 if (rt->fib6_flags & RTF_REJECT)
2431 if (!(rt->fib6_flags & RTF_GATEWAY))
2433 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2435 /* rt_cache's gateway might be different from its 'parent'
2436 * in the case of an ip redirect.
2437 * So we keep searching in the exception table if the gateway is different. */
2440 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2441 rt_cache = rt6_find_cached_rt(rt,
2445 ipv6_addr_equal(&rdfl->gateway,
2446 &rt_cache->rt6i_gateway)) {
2456 rt = net->ipv6.fib6_null_entry;
2457 else if (rt->fib6_flags & RTF_REJECT) {
2458 ret = net->ipv6.ip6_null_entry;
2462 if (rt == net->ipv6.fib6_null_entry) {
2463 fn = fib6_backtrack(fn, &fl6->saddr);
2470 dst_hold(&ret->dst);
2472 ret = ip6_create_rt_rcu(rt);
2476 trace_fib6_table_lookup(net, ret, table, fl6);
2480 static struct dst_entry *ip6_route_redirect(struct net *net,
2481 const struct flowi6 *fl6,
2482 const struct sk_buff *skb,
2483 const struct in6_addr *gateway)
2485 int flags = RT6_LOOKUP_F_HAS_SADDR;
2486 struct ip6rd_flowi rdfl;
2489 rdfl.gateway = *gateway;
2491 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2492 flags, __ip6_route_redirect);
2495 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2498 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2499 struct dst_entry *dst;
2502 memset(&fl6, 0, sizeof(fl6));
2503 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2504 fl6.flowi6_oif = oif;
2505 fl6.flowi6_mark = mark;
2506 fl6.daddr = iph->daddr;
2507 fl6.saddr = iph->saddr;
2508 fl6.flowlabel = ip6_flowinfo(iph);
2509 fl6.flowi6_uid = uid;
2511 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2512 rt6_do_redirect(dst, NULL, skb);
2515 EXPORT_SYMBOL_GPL(ip6_redirect);
2517 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2520 const struct ipv6hdr *iph = ipv6_hdr(skb);
2521 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2522 struct dst_entry *dst;
2525 memset(&fl6, 0, sizeof(fl6));
2526 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2527 fl6.flowi6_oif = oif;
2528 fl6.flowi6_mark = mark;
2529 fl6.daddr = msg->dest;
2530 fl6.saddr = iph->daddr;
2531 fl6.flowi6_uid = sock_net_uid(net, NULL);
2533 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2534 rt6_do_redirect(dst, NULL, skb);
2538 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2540 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2543 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2545 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2547 struct net_device *dev = dst->dev;
2548 unsigned int mtu = dst_mtu(dst);
2549 struct net *net = dev_net(dev);
2551 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2553 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2554 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2557 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2558 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2559 * IPV6_MAXPLEN is also valid and means: "any MSS,
2560 * rely only on pmtu discovery"
2562 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2567 static unsigned int ip6_mtu(const struct dst_entry *dst)
2569 struct inet6_dev *idev;
2572 mtu = dst_metric_raw(dst, RTAX_MTU);
2579 idev = __in6_dev_get(dst->dev);
2581 mtu = idev->cnf.mtu6;
2585 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2587 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
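/* A quick worked example (illustrative, assuming a plain 1500-byte Ethernet
 * link with no RTAX_MTU metric and no lwtunnel encapsulation):
 *
 *	ip6_mtu():		1500 (from idev->cnf.mtu6, clamped to IP6_MAX_MTU)
 *	ip6_default_advmss():	1500 - 40 (ipv6hdr) - 20 (tcphdr) = 1440
 *
 * The advertised MSS is bounded from below by the ip6_rt_min_advmss sysctl
 * (IPV6_MIN_MTU - 60 = 1220 by default, see ip6_route_net_init() below) and
 * from above by IPV6_MAXPLEN - 20 = 65515 for non-jumbo payloads.
 */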
2590 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2593 struct dst_entry *dst;
2594 struct rt6_info *rt;
2595 struct inet6_dev *idev = in6_dev_get(dev);
2596 struct net *net = dev_net(dev);
2598 if (unlikely(!idev))
2599 return ERR_PTR(-ENODEV);
2601 rt = ip6_dst_alloc(net, dev, 0);
2602 if (unlikely(!rt)) {
2604 dst = ERR_PTR(-ENOMEM);
2608 rt->dst.flags |= DST_HOST;
2609 rt->dst.input = ip6_input;
2610 rt->dst.output = ip6_output;
2611 rt->rt6i_gateway = fl6->daddr;
2612 rt->rt6i_dst.addr = fl6->daddr;
2613 rt->rt6i_dst.plen = 128;
2614 rt->rt6i_idev = idev;
2615 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2617 /* Add this dst into uncached_list so that rt6_disable_ip() can
2618 * do proper release of the net_device
2620 rt6_uncached_list_add(rt);
2621 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2623 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2629 static int ip6_dst_gc(struct dst_ops *ops)
2631 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2632 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2633 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2634 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2635 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2636 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2639 entries = dst_entries_get_fast(ops);
2640 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2641 entries <= rt_max_size)
2644 net->ipv6.ip6_rt_gc_expire++;
2645 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2646 entries = dst_entries_get_slow(ops);
2647 if (entries < ops->gc_thresh)
2648 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2650 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2651 return entries > rt_max_size;
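/* Rough sketch of the GC control loop above, assuming HZ=1000 and the
 * defaults set in ip6_route_net_init() further down (max_size 4096,
 * gc_min_interval HZ/2, gc_timeout 60s, gc_elasticity 9, ip6_rt_gc_expire
 * starting at 30s):
 *
 *	- calls arriving less than 0.5s after the last GC return early as
 *	  long as the dst count stays at or below 4096;
 *	- otherwise fib6_run_gc() runs with ip6_rt_gc_expire as the age
 *	  limit for cached entries;
 *	- a pass that brings the count below ops->gc_thresh resets the age
 *	  limit to gc_timeout/2 = 30s, i.e. collection relaxes again;
 *	- every pass then decays the limit by expire >> gc_elasticity, e.g.
 *	  30000 jiffies -> 30001 - (30001 >> 9) = 29943, so sustained
 *	  pressure makes the collector progressively more aggressive.
 */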
2654 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2655 struct fib6_config *cfg)
2657 struct dst_metrics *p;
2662 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2666 refcount_set(&p->refcnt, 1);
2667 rt->fib6_metrics = p;
2669 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
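/* Illustrative example (the iproute2 syntax and device name are assumptions
 * of this note, not something this file depends on): a request such as
 *
 *	ip -6 route add 2001:db8::/64 dev eth0 mtu 1400 hoplimit 32
 *
 * carries the per-route metrics in an RTA_METRICS nest (cfg->fc_mx above);
 * ip_metrics_convert() copies them into p->metrics[] indexed by RTAX_MTU,
 * RTAX_HOPLIMIT, etc., and the dst_metrics block is shared via p->refcnt.
 */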
2672 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2673 struct fib6_config *cfg,
2674 const struct in6_addr *gw_addr,
2675 u32 tbid, int flags)
2677 struct flowi6 fl6 = {
2678 .flowi6_oif = cfg->fc_ifindex,
2680 .saddr = cfg->fc_prefsrc,
2682 struct fib6_table *table;
2683 struct rt6_info *rt;
2685 table = fib6_get_table(net, tbid);
2689 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2690 flags |= RT6_LOOKUP_F_HAS_SADDR;
2692 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2693 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2695 /* if table lookup failed, fall back to full lookup */
2696 if (rt == net->ipv6.ip6_null_entry) {
2704 static int ip6_route_check_nh_onlink(struct net *net,
2705 struct fib6_config *cfg,
2706 const struct net_device *dev,
2707 struct netlink_ext_ack *extack)
2709 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2710 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2711 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2712 struct rt6_info *grt;
2716 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2718 if (!grt->dst.error &&
2719 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2720 NL_SET_ERR_MSG(extack,
2721 "Nexthop has invalid gateway or device mismatch");
2731 static int ip6_route_check_nh(struct net *net,
2732 struct fib6_config *cfg,
2733 struct net_device **_dev,
2734 struct inet6_dev **idev)
2736 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2737 struct net_device *dev = _dev ? *_dev : NULL;
2738 struct rt6_info *grt = NULL;
2739 int err = -EHOSTUNREACH;
2741 if (cfg->fc_table) {
2742 int flags = RT6_LOOKUP_F_IFACE;
2744 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2745 cfg->fc_table, flags);
2747 if (grt->rt6i_flags & RTF_GATEWAY ||
2748 (dev && dev != grt->dst.dev)) {
2756 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2762 if (dev != grt->dst.dev) {
2767 *_dev = dev = grt->dst.dev;
2768 *idev = grt->rt6i_idev;
2770 in6_dev_hold(grt->rt6i_idev);
2773 if (!(grt->rt6i_flags & RTF_GATEWAY))
2782 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2783 struct net_device **_dev, struct inet6_dev **idev,
2784 struct netlink_ext_ack *extack)
2786 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2787 int gwa_type = ipv6_addr_type(gw_addr);
2788 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2789 const struct net_device *dev = *_dev;
2790 bool need_addr_check = !dev;
2793 /* if gw_addr is local we will fail to detect this in case the
2794 * address is still TENTATIVE (DAD in progress): rt6_lookup()
2795 * will return the already-added prefix route via the interface
2796 * the prefix route was assigned to, which might be non-loopback.
2799 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2800 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2804 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2805 /* IPv6 strictly inhibits using non-link-local
2806 * addresses as nexthop addresses.
2807 * Otherwise, the router will not be able to send redirects.
2808 * It is very good, but in some (rare!) circumstances
2809 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2810 * some exceptions. --ANK
2811 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2814 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2815 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2819 if (cfg->fc_flags & RTNH_F_ONLINK)
2820 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2822 err = ip6_route_check_nh(net, cfg, _dev, idev);
2828 /* reload in case device was changed */
2833 NL_SET_ERR_MSG(extack, "Egress device not specified");
2835 } else if (dev->flags & IFF_LOOPBACK) {
2836 NL_SET_ERR_MSG(extack,
2837 "Egress device can not be loopback device for this route");
2841 /* if we did not check gw_addr above, do so now that the
2842 * egress device has been resolved.
2844 if (need_addr_check &&
2845 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2846 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2855 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2857 struct netlink_ext_ack *extack)
2859 struct net *net = cfg->fc_nlinfo.nl_net;
2860 struct fib6_info *rt = NULL;
2861 struct net_device *dev = NULL;
2862 struct inet6_dev *idev = NULL;
2863 struct fib6_table *table;
2867 /* RTF_PCPU is an internal flag; can not be set by userspace */
2868 if (cfg->fc_flags & RTF_PCPU) {
2869 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2873 /* RTF_CACHE is an internal flag; can not be set by userspace */
2874 if (cfg->fc_flags & RTF_CACHE) {
2875 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2879 if (cfg->fc_type > RTN_MAX) {
2880 NL_SET_ERR_MSG(extack, "Invalid route type");
2884 if (cfg->fc_dst_len > 128) {
2885 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2888 if (cfg->fc_src_len > 128) {
2889 NL_SET_ERR_MSG(extack, "Invalid source address length");
2892 #ifndef CONFIG_IPV6_SUBTREES
2893 if (cfg->fc_src_len) {
2894 NL_SET_ERR_MSG(extack,
2895 "Specifying source address requires IPV6_SUBTREES to be enabled");
2899 if (cfg->fc_ifindex) {
2901 dev = dev_get_by_index(net, cfg->fc_ifindex);
2904 idev = in6_dev_get(dev);
2909 if (cfg->fc_metric == 0)
2910 cfg->fc_metric = IP6_RT_PRIO_USER;
2912 if (cfg->fc_flags & RTNH_F_ONLINK) {
2914 NL_SET_ERR_MSG(extack,
2915 "Nexthop device required for onlink");
2920 if (!(dev->flags & IFF_UP)) {
2921 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2928 if (cfg->fc_nlinfo.nlh &&
2929 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2930 table = fib6_get_table(net, cfg->fc_table);
2932 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2933 table = fib6_new_table(net, cfg->fc_table);
2936 table = fib6_new_table(net, cfg->fc_table);
2943 rt = fib6_info_alloc(gfp_flags);
2947 if (cfg->fc_flags & RTF_ADDRCONF)
2948 rt->dst_nocount = true;
2950 err = ip6_convert_metrics(net, rt, cfg);
2954 if (cfg->fc_flags & RTF_EXPIRES)
2955 fib6_set_expires(rt, jiffies +
2956 clock_t_to_jiffies(cfg->fc_expires));
2958 fib6_clean_expires(rt);
2960 if (cfg->fc_protocol == RTPROT_UNSPEC)
2961 cfg->fc_protocol = RTPROT_BOOT;
2962 rt->fib6_protocol = cfg->fc_protocol;
2964 addr_type = ipv6_addr_type(&cfg->fc_dst);
2966 if (cfg->fc_encap) {
2967 struct lwtunnel_state *lwtstate;
2969 err = lwtunnel_build_state(cfg->fc_encap_type,
2970 cfg->fc_encap, AF_INET6, cfg,
2974 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2977 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2978 rt->fib6_dst.plen = cfg->fc_dst_len;
2979 if (rt->fib6_dst.plen == 128)
2980 rt->dst_host = true;
2982 #ifdef CONFIG_IPV6_SUBTREES
2983 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
2984 rt->fib6_src.plen = cfg->fc_src_len;
2987 rt->fib6_metric = cfg->fc_metric;
2988 rt->fib6_nh.nh_weight = 1;
2990 rt->fib6_type = cfg->fc_type;
2992 /* We cannot add true routes via loopback here;
2993 they would result in kernel looping. Promote them to reject routes instead.
2995 if ((cfg->fc_flags & RTF_REJECT) ||
2996 (dev && (dev->flags & IFF_LOOPBACK) &&
2997 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2998 !(cfg->fc_flags & RTF_LOCAL))) {
2999 /* hold loopback dev/idev if we haven't done so. */
3000 if (dev != net->loopback_dev) {
3005 dev = net->loopback_dev;
3007 idev = in6_dev_get(dev);
3013 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3017 if (cfg->fc_flags & RTF_GATEWAY) {
3018 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3022 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3029 if (idev->cnf.disable_ipv6) {
3030 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3035 if (!(dev->flags & IFF_UP)) {
3036 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3041 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3042 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3043 NL_SET_ERR_MSG(extack, "Invalid source address");
3047 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3048 rt->fib6_prefsrc.plen = 128;
3050 rt->fib6_prefsrc.plen = 0;
3052 rt->fib6_flags = cfg->fc_flags;
3055 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3056 !netif_carrier_ok(dev))
3057 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3058 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3059 rt->fib6_nh.nh_dev = dev;
3060 rt->fib6_table = table;
3062 cfg->fc_nlinfo.nl_net = dev_net(dev);
3074 fib6_info_release(rt);
3075 return ERR_PTR(err);
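/* Illustrative sketch of what ip6_route_info_create() typically receives
 * (device name and addresses are made up for the example): a request like
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 512
 *
 * arrives, via rtm_to_fib6_config() further down, roughly as:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT_TABLE_MAIN,
 *		.fc_dst		= 2001:db8::, .fc_dst_len = 64,
 *		.fc_gateway	= fe80::1,
 *		.fc_ifindex	= <ifindex of eth0>,
 *		.fc_metric	= 512,
 *		.fc_flags	= RTF_UP | RTF_GATEWAY,
 *	};
 *
 * with fc_nlinfo describing the netlink requester.  A zero fc_metric is
 * promoted to IP6_RT_PRIO_USER (1024) above.
 */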
3078 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3079 struct netlink_ext_ack *extack)
3081 struct fib6_info *rt;
3084 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3088 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3089 fib6_info_release(rt);
3094 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3096 struct net *net = info->nl_net;
3097 struct fib6_table *table;
3100 if (rt == net->ipv6.fib6_null_entry) {
3105 table = rt->fib6_table;
3106 spin_lock_bh(&table->tb6_lock);
3107 err = fib6_del(rt, info);
3108 spin_unlock_bh(&table->tb6_lock);
3111 fib6_info_release(rt);
3115 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3117 struct nl_info info = { .nl_net = net };
3119 return __ip6_del_rt(rt, &info);
3122 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3124 struct nl_info *info = &cfg->fc_nlinfo;
3125 struct net *net = info->nl_net;
3126 struct sk_buff *skb = NULL;
3127 struct fib6_table *table;
3130 if (rt == net->ipv6.fib6_null_entry)
3132 table = rt->fib6_table;
3133 spin_lock_bh(&table->tb6_lock);
3135 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3136 struct fib6_info *sibling, *next_sibling;
3138 /* prefer to send a single notification with all hops */
3139 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3141 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3143 if (rt6_fill_node(net, skb, rt, NULL,
3144 NULL, NULL, 0, RTM_DELROUTE,
3145 info->portid, seq, 0) < 0) {
3149 info->skip_notify = 1;
3152 list_for_each_entry_safe(sibling, next_sibling,
3155 err = fib6_del(sibling, info);
3161 err = fib6_del(rt, info);
3163 spin_unlock_bh(&table->tb6_lock);
3165 fib6_info_release(rt);
3168 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3169 info->nlh, gfp_any());
3174 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3178 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3181 if (cfg->fc_flags & RTF_GATEWAY &&
3182 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3184 if (dst_hold_safe(&rt->dst))
3185 rc = rt6_remove_exception_rt(rt);
3190 static int ip6_route_del(struct fib6_config *cfg,
3191 struct netlink_ext_ack *extack)
3193 struct rt6_info *rt_cache;
3194 struct fib6_table *table;
3195 struct fib6_info *rt;
3196 struct fib6_node *fn;
3199 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3201 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3207 fn = fib6_locate(&table->tb6_root,
3208 &cfg->fc_dst, cfg->fc_dst_len,
3209 &cfg->fc_src, cfg->fc_src_len,
3210 !(cfg->fc_flags & RTF_CACHE));
3213 for_each_fib6_node_rt_rcu(fn) {
3214 if (cfg->fc_flags & RTF_CACHE) {
3217 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3220 rc = ip6_del_cached_rt(rt_cache, cfg);
3226 if (cfg->fc_ifindex &&
3227 (!rt->fib6_nh.nh_dev ||
3228 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3230 if (cfg->fc_flags & RTF_GATEWAY &&
3231 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3233 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3235 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3240 /* if a gateway was specified, only delete the one hop */
3241 if (cfg->fc_flags & RTF_GATEWAY)
3242 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3244 return __ip6_del_rt_siblings(rt, cfg);
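/* Illustrative consequence of the check above (addresses are made up):
 *
 *	ip -6 route del 2001:db8::/64 via fe80::1 dev eth0
 *
 * carries RTF_GATEWAY and therefore removes only the matching nexthop via
 * __ip6_del_rt(), while
 *
 *	ip -6 route del 2001:db8::/64
 *
 * has no gateway, so inet6_rtm_delroute() sets fc_delete_all_nh and the
 * whole sibling (ECMP) set is torn down in __ip6_del_rt_siblings().
 */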
3252 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3254 struct netevent_redirect netevent;
3255 struct rt6_info *rt, *nrt = NULL;
3256 struct ndisc_options ndopts;
3257 struct inet6_dev *in6_dev;
3258 struct neighbour *neigh;
3259 struct fib6_info *from;
3261 int optlen, on_link;
3264 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3265 optlen -= sizeof(*msg);
3268 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3272 msg = (struct rd_msg *)icmp6_hdr(skb);
3274 if (ipv6_addr_is_multicast(&msg->dest)) {
3275 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3280 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3282 } else if (ipv6_addr_type(&msg->target) !=
3283 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3284 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3288 in6_dev = __in6_dev_get(skb->dev);
3291 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3295 * The IP source address of the Redirect MUST be the same as the current
3296 * first-hop router for the specified ICMP Destination Address.
3299 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3300 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3305 if (ndopts.nd_opts_tgt_lladdr) {
3306 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3309 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3314 rt = (struct rt6_info *) dst;
3315 if (rt->rt6i_flags & RTF_REJECT) {
3316 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3320 /* Redirect received -> path was valid.
3321 * Look, redirects are sent only in response to data packets,
3322 * so this nexthop is apparently reachable. --ANK
3324 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3326 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3331 * We have finally decided to accept it.
3334 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3335 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3336 NEIGH_UPDATE_F_OVERRIDE|
3337 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3338 NEIGH_UPDATE_F_ISROUTER)),
3339 NDISC_REDIRECT, &ndopts);
3342 from = rcu_dereference(rt->from);
3343 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3348 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3350 nrt->rt6i_flags &= ~RTF_GATEWAY;
3352 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3354 /* No need to remove rt from the exception table if rt is
3355 * a cached route because rt6_insert_exception() will take care of it.
3358 if (rt6_insert_exception(nrt, rt->from)) {
3359 dst_release_immediate(&nrt->dst);
3363 netevent.old = &rt->dst;
3364 netevent.new = &nrt->dst;
3365 netevent.daddr = &msg->dest;
3366 netevent.neigh = neigh;
3367 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3370 neigh_release(neigh);
3373 #ifdef CONFIG_IPV6_ROUTE_INFO
3374 static struct fib6_info *rt6_get_route_info(struct net *net,
3375 const struct in6_addr *prefix, int prefixlen,
3376 const struct in6_addr *gwaddr,
3377 struct net_device *dev)
3379 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3380 int ifindex = dev->ifindex;
3381 struct fib6_node *fn;
3382 struct fib6_info *rt = NULL;
3383 struct fib6_table *table;
3385 table = fib6_get_table(net, tb_id);
3390 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3394 for_each_fib6_node_rt_rcu(fn) {
3395 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3397 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3399 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3409 static struct fib6_info *rt6_add_route_info(struct net *net,
3410 const struct in6_addr *prefix, int prefixlen,
3411 const struct in6_addr *gwaddr,
3412 struct net_device *dev,
3415 struct fib6_config cfg = {
3416 .fc_metric = IP6_RT_PRIO_USER,
3417 .fc_ifindex = dev->ifindex,
3418 .fc_dst_len = prefixlen,
3419 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3420 RTF_UP | RTF_PREF(pref),
3421 .fc_protocol = RTPROT_RA,
3422 .fc_type = RTN_UNICAST,
3423 .fc_nlinfo.portid = 0,
3424 .fc_nlinfo.nlh = NULL,
3425 .fc_nlinfo.nl_net = net,
3428 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3429 cfg.fc_dst = *prefix;
3430 cfg.fc_gateway = *gwaddr;
3432 /* We should treat it as a default route if prefix length is 0. */
3434 cfg.fc_flags |= RTF_DEFAULT;
3436 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3438 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3442 struct fib6_info *rt6_get_dflt_router(struct net *net,
3443 const struct in6_addr *addr,
3444 struct net_device *dev)
3446 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3447 struct fib6_info *rt;
3448 struct fib6_table *table;
3450 table = fib6_get_table(net, tb_id);
3455 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3456 if (dev == rt->fib6_nh.nh_dev &&
3457 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3458 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3467 struct fib6_info *rt6_add_dflt_router(struct net *net,
3468 const struct in6_addr *gwaddr,
3469 struct net_device *dev,
3472 struct fib6_config cfg = {
3473 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3474 .fc_metric = IP6_RT_PRIO_USER,
3475 .fc_ifindex = dev->ifindex,
3476 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3477 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3478 .fc_protocol = RTPROT_RA,
3479 .fc_type = RTN_UNICAST,
3480 .fc_nlinfo.portid = 0,
3481 .fc_nlinfo.nlh = NULL,
3482 .fc_nlinfo.nl_net = net,
3485 cfg.fc_gateway = *gwaddr;
3487 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3488 struct fib6_table *table;
3490 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3492 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3495 return rt6_get_dflt_router(net, gwaddr, dev);
3498 static void __rt6_purge_dflt_routers(struct net *net,
3499 struct fib6_table *table)
3501 struct fib6_info *rt;
3505 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3506 struct net_device *dev = fib6_info_nh_dev(rt);
3507 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3509 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3510 (!idev || idev->cnf.accept_ra != 2)) {
3513 ip6_del_rt(net, rt);
3519 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3522 void rt6_purge_dflt_routers(struct net *net)
3524 struct fib6_table *table;
3525 struct hlist_head *head;
3530 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3531 head = &net->ipv6.fib_table_hash[h];
3532 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3533 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3534 __rt6_purge_dflt_routers(net, table);
3541 static void rtmsg_to_fib6_config(struct net *net,
3542 struct in6_rtmsg *rtmsg,
3543 struct fib6_config *cfg)
3545 memset(cfg, 0, sizeof(*cfg));
3547 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3549 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3550 cfg->fc_metric = rtmsg->rtmsg_metric;
3551 cfg->fc_expires = rtmsg->rtmsg_info;
3552 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3553 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3554 cfg->fc_flags = rtmsg->rtmsg_flags;
3555 cfg->fc_type = rtmsg->rtmsg_type;
3557 cfg->fc_nlinfo.nl_net = net;
3559 cfg->fc_dst = rtmsg->rtmsg_dst;
3560 cfg->fc_src = rtmsg->rtmsg_src;
3561 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3564 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3566 struct fib6_config cfg;
3567 struct in6_rtmsg rtmsg;
3571 case SIOCADDRT: /* Add a route */
3572 case SIOCDELRT: /* Delete a route */
3573 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3575 err = copy_from_user(&rtmsg, arg,
3576 sizeof(struct in6_rtmsg));
3580 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3585 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3588 err = ip6_route_del(&cfg, NULL);
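/* A minimal userspace sketch of this legacy interface (illustrative only;
 * interface name and addresses are assumptions, and the modern path is
 * rtnetlink via inet6_rtm_newroute()/inet6_rtm_delroute() below):
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <net/if.h>
 *	#include <net/route.h>		for RTF_UP, RTF_GATEWAY
 *	#include <linux/ipv6_route.h>	for struct in6_rtmsg
 *
 *	struct in6_rtmsg rt = {
 *		.rtmsg_dst_len	= 64,
 *		.rtmsg_metric	= 1024,
 *		.rtmsg_flags	= RTF_UP | RTF_GATEWAY,
 *		.rtmsg_ifindex	= if_nametoindex("eth0"),
 *	};
 *	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
 *	inet_pton(AF_INET6, "fe80::1", &rt.rtmsg_gateway);
 *
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *	ioctl(fd, SIOCADDRT, &rt);	requires CAP_NET_ADMIN, see above
 */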
3602 * Drop the packet on the floor
3605 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3608 struct dst_entry *dst = skb_dst(skb);
3609 switch (ipstats_mib_noroutes) {
3610 case IPSTATS_MIB_INNOROUTES:
3611 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3612 if (type == IPV6_ADDR_ANY) {
3613 IP6_INC_STATS(dev_net(dst->dev),
3614 __in6_dev_get_safely(skb->dev),
3615 IPSTATS_MIB_INADDRERRORS);
3619 case IPSTATS_MIB_OUTNOROUTES:
3620 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3621 ipstats_mib_noroutes);
3624 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3629 static int ip6_pkt_discard(struct sk_buff *skb)
3631 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3634 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3636 skb->dev = skb_dst(skb)->dev;
3637 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3640 static int ip6_pkt_prohibit(struct sk_buff *skb)
3642 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3645 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3647 skb->dev = skb_dst(skb)->dev;
3648 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
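/* Illustrative mapping (the iproute2 spellings below are an assumption of
 * this note, not something this file defines):
 *
 *	ip -6 route add unreachable 2001:db8::/48
 *	ip -6 route add prohibit    2001:db8::/48
 *
 * The first installs a reject route answered through ip6_pkt_discard*()
 * with ICMPV6_NOROUTE, the second uses ip6_pkt_prohibit*() and
 * ICMPV6_ADM_PROHIBITED, while a "blackhole" route drops packets silently
 * without generating any ICMPv6 error.
 */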
3652 * Allocate a dst for local (unicast / anycast) address.
3655 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3656 struct inet6_dev *idev,
3657 const struct in6_addr *addr,
3658 bool anycast, gfp_t gfp_flags)
3661 struct net_device *dev = idev->dev;
3662 struct fib6_info *f6i;
3664 f6i = fib6_info_alloc(gfp_flags);
3666 return ERR_PTR(-ENOMEM);
3668 f6i->dst_nocount = true;
3669 f6i->dst_host = true;
3670 f6i->fib6_protocol = RTPROT_KERNEL;
3671 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3673 f6i->fib6_type = RTN_ANYCAST;
3674 f6i->fib6_flags |= RTF_ANYCAST;
3676 f6i->fib6_type = RTN_LOCAL;
3677 f6i->fib6_flags |= RTF_LOCAL;
3680 f6i->fib6_nh.nh_gw = *addr;
3682 f6i->fib6_nh.nh_dev = dev;
3683 f6i->fib6_dst.addr = *addr;
3684 f6i->fib6_dst.plen = 128;
3685 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3686 f6i->fib6_table = fib6_get_table(net, tb_id);
3691 /* remove deleted ip from prefsrc entries */
3692 struct arg_dev_net_ip {
3693 struct net_device *dev;
3695 struct in6_addr *addr;
3698 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3700 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3701 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3702 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3704 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3705 rt != net->ipv6.fib6_null_entry &&
3706 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3707 spin_lock_bh(&rt6_exception_lock);
3708 /* remove prefsrc entry */
3709 rt->fib6_prefsrc.plen = 0;
3710 /* need to update cache as well */
3711 rt6_exceptions_remove_prefsrc(rt);
3712 spin_unlock_bh(&rt6_exception_lock);
3717 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3719 struct net *net = dev_net(ifp->idev->dev);
3720 struct arg_dev_net_ip adni = {
3721 .dev = ifp->idev->dev,
3725 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3728 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3730 /* Remove routers and update dst entries when a gateway turns into a host. */
3731 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3733 struct in6_addr *gateway = (struct in6_addr *)arg;
3735 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3736 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3740 /* Further clean up cached routes in exception table.
3741 * This is needed because a cached route may have a different
3742 * gateway than its 'parent' in the case of an ip redirect.
3744 rt6_exceptions_clean_tohost(rt, gateway);
3749 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3751 fib6_clean_all(net, fib6_clean_tohost, gateway);
3754 struct arg_netdev_event {
3755 const struct net_device *dev;
3757 unsigned int nh_flags;
3758 unsigned long event;
3762 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3764 struct fib6_info *iter;
3765 struct fib6_node *fn;
3767 fn = rcu_dereference_protected(rt->fib6_node,
3768 lockdep_is_held(&rt->fib6_table->tb6_lock));
3769 iter = rcu_dereference_protected(fn->leaf,
3770 lockdep_is_held(&rt->fib6_table->tb6_lock));
3772 if (iter->fib6_metric == rt->fib6_metric &&
3773 rt6_qualify_for_ecmp(iter))
3775 iter = rcu_dereference_protected(iter->rt6_next,
3776 lockdep_is_held(&rt->fib6_table->tb6_lock));
3782 static bool rt6_is_dead(const struct fib6_info *rt)
3784 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3785 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3786 fib6_ignore_linkdown(rt)))
3792 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3794 struct fib6_info *iter;
3797 if (!rt6_is_dead(rt))
3798 total += rt->fib6_nh.nh_weight;
3800 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3801 if (!rt6_is_dead(iter))
3802 total += iter->fib6_nh.nh_weight;
3808 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3810 int upper_bound = -1;
3812 if (!rt6_is_dead(rt)) {
3813 *weight += rt->fib6_nh.nh_weight;
3814 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3817 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3820 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3822 struct fib6_info *iter;
3825 rt6_upper_bound_set(rt, &weight, total);
3827 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3828 rt6_upper_bound_set(iter, &weight, total);
3831 void rt6_multipath_rebalance(struct fib6_info *rt)
3833 struct fib6_info *first;
3836 /* In case the entire multipath route was marked for flushing,
3837 * there is no need to rebalance upon the removal of every sibling route.
3840 if (!rt->fib6_nsiblings || rt->should_flush)
3843 /* During lookup routes are evaluated in order, so we need to
3844 * make sure upper bounds are assigned from the first sibling onwards.
3847 first = rt6_multipath_first_sibling(rt);
3848 if (WARN_ON_ONCE(!first))
3851 total = rt6_multipath_total_weight(first);
3852 rt6_multipath_upper_bound_set(first, total);
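/* Worked example of the rebalancing above (weights are made up): for three
 * siblings with nh_weight 1, 2 and 1, total = 4 and the running *weight is
 * 1, 3, 4, so the per-nexthop upper bounds become
 *
 *	DIV_ROUND_CLOSEST_ULL(1ULL << 31, 4) = 0x20000000
 *	DIV_ROUND_CLOSEST_ULL(3ULL << 31, 4) = 0x60000000
 *	DIV_ROUND_CLOSEST_ULL(4ULL << 31, 4) = 0x80000000
 *
 * The multipath selection code earlier in this file walks the siblings in
 * order and picks the first one whose upper bound is >= the 31-bit
 * multipath hash of the flow, giving each nexthop a share proportional to
 * its weight; dead nexthops keep an upper bound of -1 and are skipped.
 */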
3855 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3857 const struct arg_netdev_event *arg = p_arg;
3858 struct net *net = dev_net(arg->dev);
3860 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3861 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3862 fib6_update_sernum_upto_root(net, rt);
3863 rt6_multipath_rebalance(rt);
3869 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3871 struct arg_netdev_event arg = {
3874 .nh_flags = nh_flags,
3878 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3879 arg.nh_flags |= RTNH_F_LINKDOWN;
3881 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3884 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3885 const struct net_device *dev)
3887 struct fib6_info *iter;
3889 if (rt->fib6_nh.nh_dev == dev)
3891 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3892 if (iter->fib6_nh.nh_dev == dev)
3898 static void rt6_multipath_flush(struct fib6_info *rt)
3900 struct fib6_info *iter;
3902 rt->should_flush = 1;
3903 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3904 iter->should_flush = 1;
3907 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3908 const struct net_device *down_dev)
3910 struct fib6_info *iter;
3911 unsigned int dead = 0;
3913 if (rt->fib6_nh.nh_dev == down_dev ||
3914 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3916 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3917 if (iter->fib6_nh.nh_dev == down_dev ||
3918 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3924 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3925 const struct net_device *dev,
3926 unsigned int nh_flags)
3928 struct fib6_info *iter;
3930 if (rt->fib6_nh.nh_dev == dev)
3931 rt->fib6_nh.nh_flags |= nh_flags;
3932 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3933 if (iter->fib6_nh.nh_dev == dev)
3934 iter->fib6_nh.nh_flags |= nh_flags;
3937 /* called with write lock held for table with rt */
3938 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3940 const struct arg_netdev_event *arg = p_arg;
3941 const struct net_device *dev = arg->dev;
3942 struct net *net = dev_net(dev);
3944 if (rt == net->ipv6.fib6_null_entry)
3947 switch (arg->event) {
3948 case NETDEV_UNREGISTER:
3949 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3951 if (rt->should_flush)
3953 if (!rt->fib6_nsiblings)
3954 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3955 if (rt6_multipath_uses_dev(rt, dev)) {
3958 count = rt6_multipath_dead_count(rt, dev);
3959 if (rt->fib6_nsiblings + 1 == count) {
3960 rt6_multipath_flush(rt);
3963 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3965 fib6_update_sernum(net, rt);
3966 rt6_multipath_rebalance(rt);
3970 if (rt->fib6_nh.nh_dev != dev ||
3971 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
3973 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3974 rt6_multipath_rebalance(rt);
3981 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3983 struct arg_netdev_event arg = {
3990 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3993 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3995 rt6_sync_down_dev(dev, event);
3996 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3997 neigh_ifdown(&nd_tbl, dev);
4000 struct rt6_mtu_change_arg {
4001 struct net_device *dev;
4005 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4007 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4008 struct inet6_dev *idev;
4010 /* In IPv6 PMTU discovery is not optional,
4011 so an RTAX_MTU lock cannot disable it.
4012 We still use this lock to block changes
4013 caused by addrconf/ndisc.
4016 idev = __in6_dev_get(arg->dev);
4020 /* For an administrative MTU increase there is no way to discover
4021 an IPv6 PMTU increase, so the PMTU should be updated here.
4022 Since RFC 1981 doesn't cover administrative MTU increases,
4023 updating the PMTU here is a MUST (e.g. for jumbo frames).
4025 if (rt->fib6_nh.nh_dev == arg->dev &&
4026 !fib6_metric_locked(rt, RTAX_MTU)) {
4027 u32 mtu = rt->fib6_pmtu;
4029 if (mtu >= arg->mtu ||
4030 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4031 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4033 spin_lock_bh(&rt6_exception_lock);
4034 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4035 spin_unlock_bh(&rt6_exception_lock);
4040 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4042 struct rt6_mtu_change_arg arg = {
4047 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4050 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4051 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4052 [RTA_OIF] = { .type = NLA_U32 },
4053 [RTA_IIF] = { .type = NLA_U32 },
4054 [RTA_PRIORITY] = { .type = NLA_U32 },
4055 [RTA_METRICS] = { .type = NLA_NESTED },
4056 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4057 [RTA_PREF] = { .type = NLA_U8 },
4058 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4059 [RTA_ENCAP] = { .type = NLA_NESTED },
4060 [RTA_EXPIRES] = { .type = NLA_U32 },
4061 [RTA_UID] = { .type = NLA_U32 },
4062 [RTA_MARK] = { .type = NLA_U32 },
4065 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4066 struct fib6_config *cfg,
4067 struct netlink_ext_ack *extack)
4070 struct nlattr *tb[RTA_MAX+1];
4074 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4080 rtm = nlmsg_data(nlh);
4081 memset(cfg, 0, sizeof(*cfg));
4083 cfg->fc_table = rtm->rtm_table;
4084 cfg->fc_dst_len = rtm->rtm_dst_len;
4085 cfg->fc_src_len = rtm->rtm_src_len;
4086 cfg->fc_flags = RTF_UP;
4087 cfg->fc_protocol = rtm->rtm_protocol;
4088 cfg->fc_type = rtm->rtm_type;
4090 if (rtm->rtm_type == RTN_UNREACHABLE ||
4091 rtm->rtm_type == RTN_BLACKHOLE ||
4092 rtm->rtm_type == RTN_PROHIBIT ||
4093 rtm->rtm_type == RTN_THROW)
4094 cfg->fc_flags |= RTF_REJECT;
4096 if (rtm->rtm_type == RTN_LOCAL)
4097 cfg->fc_flags |= RTF_LOCAL;
4099 if (rtm->rtm_flags & RTM_F_CLONED)
4100 cfg->fc_flags |= RTF_CACHE;
4102 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4104 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4105 cfg->fc_nlinfo.nlh = nlh;
4106 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4108 if (tb[RTA_GATEWAY]) {
4109 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4110 cfg->fc_flags |= RTF_GATEWAY;
4114 int plen = (rtm->rtm_dst_len + 7) >> 3;
4116 if (nla_len(tb[RTA_DST]) < plen)
4119 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4123 int plen = (rtm->rtm_src_len + 7) >> 3;
4125 if (nla_len(tb[RTA_SRC]) < plen)
4128 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4131 if (tb[RTA_PREFSRC])
4132 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4135 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4137 if (tb[RTA_PRIORITY])
4138 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4140 if (tb[RTA_METRICS]) {
4141 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4142 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4146 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4148 if (tb[RTA_MULTIPATH]) {
4149 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4150 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4152 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4153 cfg->fc_mp_len, extack);
4159 pref = nla_get_u8(tb[RTA_PREF]);
4160 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4161 pref != ICMPV6_ROUTER_PREF_HIGH)
4162 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4163 cfg->fc_flags |= RTF_PREF(pref);
4167 cfg->fc_encap = tb[RTA_ENCAP];
4169 if (tb[RTA_ENCAP_TYPE]) {
4170 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4172 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4177 if (tb[RTA_EXPIRES]) {
4178 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4180 if (addrconf_finite_timeout(timeout)) {
4181 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4182 cfg->fc_flags |= RTF_EXPIRES;
4192 struct fib6_info *fib6_info;
4193 struct fib6_config r_cfg;
4194 struct list_head next;
4197 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4201 list_for_each_entry(nh, rt6_nh_list, next) {
4202 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4203 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4204 nh->r_cfg.fc_ifindex);
4208 static int ip6_route_info_append(struct net *net,
4209 struct list_head *rt6_nh_list,
4210 struct fib6_info *rt,
4211 struct fib6_config *r_cfg)
4216 list_for_each_entry(nh, rt6_nh_list, next) {
4217 /* check if fib6_info already exists */
4218 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4222 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4226 err = ip6_convert_metrics(net, rt, r_cfg);
4231 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4232 list_add_tail(&nh->next, rt6_nh_list);
4237 static void ip6_route_mpath_notify(struct fib6_info *rt,
4238 struct fib6_info *rt_last,
4239 struct nl_info *info,
4242 /* if this is an APPEND route, then rt points to the first route
4243 * inserted and rt_last points to last route inserted. Userspace
4244 * wants a consistent dump of the route which starts at the first
4245 * nexthop. Since sibling routes are always added at the end of
4246 * the list, find the first sibling of the last route appended
4248 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4249 rt = list_first_entry(&rt_last->fib6_siblings,
4255 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4258 static int ip6_route_multipath_add(struct fib6_config *cfg,
4259 struct netlink_ext_ack *extack)
4261 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4262 struct nl_info *info = &cfg->fc_nlinfo;
4263 struct fib6_config r_cfg;
4264 struct rtnexthop *rtnh;
4265 struct fib6_info *rt;
4266 struct rt6_nh *err_nh;
4267 struct rt6_nh *nh, *nh_safe;
4273 int replace = (cfg->fc_nlinfo.nlh &&
4274 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4275 LIST_HEAD(rt6_nh_list);
4277 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4278 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4279 nlflags |= NLM_F_APPEND;
4281 remaining = cfg->fc_mp_len;
4282 rtnh = (struct rtnexthop *)cfg->fc_mp;
4284 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4285 * fib6_info structs per nexthop
4287 while (rtnh_ok(rtnh, remaining)) {
4288 memcpy(&r_cfg, cfg, sizeof(*cfg));
4289 if (rtnh->rtnh_ifindex)
4290 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4292 attrlen = rtnh_attrlen(rtnh);
4294 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4296 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4298 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4299 r_cfg.fc_flags |= RTF_GATEWAY;
4301 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4302 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4304 r_cfg.fc_encap_type = nla_get_u16(nla);
4307 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4308 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4315 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4317 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4320 fib6_info_release(rt);
4324 rtnh = rtnh_next(rtnh, &remaining);
4327 /* for add and replace send one notification with all nexthops.
4328 * Skip the notification in fib6_add_rt2node and send one with
4329 * the full route when done
4331 info->skip_notify = 1;
4334 list_for_each_entry(nh, &rt6_nh_list, next) {
4335 rt_last = nh->fib6_info;
4336 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4337 fib6_info_release(nh->fib6_info);
4339 /* save reference to first route for notification */
4340 if (!rt_notif && !err)
4341 rt_notif = nh->fib6_info;
4343 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4344 nh->fib6_info = NULL;
4347 ip6_print_replace_route_err(&rt6_nh_list);
4352 /* Because each route is added like a single route we remove
4353 * these flags after the first nexthop: if there is a collision,
4354 * we have already failed to add the first nexthop:
4355 * fib6_add_rt2node() has rejected it; when replacing, old
4356 * nexthops have been replaced by the first new one, and the rest should be appended to it.
4359 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4364 /* success ... tell user about new route */
4365 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4369 /* send notification for routes that were added so that
4370 * the delete notifications sent by ip6_route_del are
4374 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4376 /* Delete routes that were already added */
4377 list_for_each_entry(nh, &rt6_nh_list, next) {
4380 ip6_route_del(&nh->r_cfg, extack);
4384 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4386 fib6_info_release(nh->fib6_info);
4387 list_del(&nh->next);
4394 static int ip6_route_multipath_del(struct fib6_config *cfg,
4395 struct netlink_ext_ack *extack)
4397 struct fib6_config r_cfg;
4398 struct rtnexthop *rtnh;
4401 int err = 1, last_err = 0;
4403 remaining = cfg->fc_mp_len;
4404 rtnh = (struct rtnexthop *)cfg->fc_mp;
4406 /* Parse a Multipath Entry */
4407 while (rtnh_ok(rtnh, remaining)) {
4408 memcpy(&r_cfg, cfg, sizeof(*cfg));
4409 if (rtnh->rtnh_ifindex)
4410 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4412 attrlen = rtnh_attrlen(rtnh);
4414 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4416 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4418 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4419 r_cfg.fc_flags |= RTF_GATEWAY;
4422 err = ip6_route_del(&r_cfg, extack);
4426 rtnh = rtnh_next(rtnh, &remaining);
4432 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4433 struct netlink_ext_ack *extack)
4435 struct fib6_config cfg;
4438 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4443 return ip6_route_multipath_del(&cfg, extack);
4445 cfg.fc_delete_all_nh = 1;
4446 return ip6_route_del(&cfg, extack);
4450 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4451 struct netlink_ext_ack *extack)
4453 struct fib6_config cfg;
4456 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4461 return ip6_route_multipath_add(&cfg, extack);
4463 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4466 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4468 int nexthop_len = 0;
4470 if (rt->fib6_nsiblings) {
4471 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4472 + NLA_ALIGN(sizeof(struct rtnexthop))
4473 + nla_total_size(16) /* RTA_GATEWAY */
4474 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4476 nexthop_len *= rt->fib6_nsiblings;
4479 return NLMSG_ALIGN(sizeof(struct rtmsg))
4480 + nla_total_size(16) /* RTA_SRC */
4481 + nla_total_size(16) /* RTA_DST */
4482 + nla_total_size(16) /* RTA_GATEWAY */
4483 + nla_total_size(16) /* RTA_PREFSRC */
4484 + nla_total_size(4) /* RTA_TABLE */
4485 + nla_total_size(4) /* RTA_IIF */
4486 + nla_total_size(4) /* RTA_OIF */
4487 + nla_total_size(4) /* RTA_PRIORITY */
4488 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4489 + nla_total_size(sizeof(struct rta_cacheinfo))
4490 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4491 + nla_total_size(1) /* RTA_PREF */
4492 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4496 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4497 unsigned int *flags, bool skip_oif)
4499 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4500 *flags |= RTNH_F_DEAD;
4502 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4503 *flags |= RTNH_F_LINKDOWN;
4506 if (fib6_ignore_linkdown(rt))
4507 *flags |= RTNH_F_DEAD;
4511 if (rt->fib6_flags & RTF_GATEWAY) {
4512 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4513 goto nla_put_failure;
4516 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4517 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4518 *flags |= RTNH_F_OFFLOAD;
4520 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4521 if (!skip_oif && rt->fib6_nh.nh_dev &&
4522 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4523 goto nla_put_failure;
4525 if (rt->fib6_nh.nh_lwtstate &&
4526 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4527 goto nla_put_failure;
4535 /* add multipath next hop */
4536 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4538 const struct net_device *dev = rt->fib6_nh.nh_dev;
4539 struct rtnexthop *rtnh;
4540 unsigned int flags = 0;
4542 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4544 goto nla_put_failure;
4546 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4547 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4549 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4550 goto nla_put_failure;
4552 rtnh->rtnh_flags = flags;
4554 /* length of rtnetlink header + attributes */
4555 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4563 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4564 struct fib6_info *rt, struct dst_entry *dst,
4565 struct in6_addr *dest, struct in6_addr *src,
4566 int iif, int type, u32 portid, u32 seq,
4570 struct nlmsghdr *nlh;
4575 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4579 rtm = nlmsg_data(nlh);
4580 rtm->rtm_family = AF_INET6;
4581 rtm->rtm_dst_len = rt->fib6_dst.plen;
4582 rtm->rtm_src_len = rt->fib6_src.plen;
4585 table = rt->fib6_table->tb6_id;
4587 table = RT6_TABLE_UNSPEC;
4588 rtm->rtm_table = table;
4589 if (nla_put_u32(skb, RTA_TABLE, table))
4590 goto nla_put_failure;
4592 rtm->rtm_type = rt->fib6_type;
4594 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4595 rtm->rtm_protocol = rt->fib6_protocol;
4597 if (rt->fib6_flags & RTF_CACHE)
4598 rtm->rtm_flags |= RTM_F_CLONED;
4601 if (nla_put_in6_addr(skb, RTA_DST, dest))
4602 goto nla_put_failure;
4603 rtm->rtm_dst_len = 128;
4604 } else if (rtm->rtm_dst_len)
4605 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4606 goto nla_put_failure;
4607 #ifdef CONFIG_IPV6_SUBTREES
4609 if (nla_put_in6_addr(skb, RTA_SRC, src))
4610 goto nla_put_failure;
4611 rtm->rtm_src_len = 128;
4612 } else if (rtm->rtm_src_len &&
4613 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4614 goto nla_put_failure;
4617 #ifdef CONFIG_IPV6_MROUTE
4618 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4619 int err = ip6mr_get_route(net, skb, rtm, portid);
4624 goto nla_put_failure;
4627 if (nla_put_u32(skb, RTA_IIF, iif))
4628 goto nla_put_failure;
4630 struct in6_addr saddr_buf;
4631 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4632 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4633 goto nla_put_failure;
4636 if (rt->fib6_prefsrc.plen) {
4637 struct in6_addr saddr_buf;
4638 saddr_buf = rt->fib6_prefsrc.addr;
4639 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4640 goto nla_put_failure;
4643 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4644 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4645 goto nla_put_failure;
4647 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4648 goto nla_put_failure;
4650 /* For multipath routes, walk the siblings list and add
4651 * each as a nexthop within RTA_MULTIPATH.
4653 if (rt->fib6_nsiblings) {
4654 struct fib6_info *sibling, *next_sibling;
4657 mp = nla_nest_start(skb, RTA_MULTIPATH);
4659 goto nla_put_failure;
4661 if (rt6_add_nexthop(skb, rt) < 0)
4662 goto nla_put_failure;
4664 list_for_each_entry_safe(sibling, next_sibling,
4665 &rt->fib6_siblings, fib6_siblings) {
4666 if (rt6_add_nexthop(skb, sibling) < 0)
4667 goto nla_put_failure;
4670 nla_nest_end(skb, mp);
4672 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4673 goto nla_put_failure;
4676 if (rt->fib6_flags & RTF_EXPIRES) {
4677 expires = dst ? dst->expires : rt->expires;
4681 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4682 goto nla_put_failure;
4684 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4685 goto nla_put_failure;
4688 nlmsg_end(skb, nlh);
4692 nlmsg_cancel(skb, nlh);
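/* Rough mapping of the message built above to what userspace prints (the
 * iproute2 rendering is an assumption of this note, shown for orientation):
 *
 *	rtm_dst_len + RTA_DST		"2001:db8::/64"
 *	RTA_GATEWAY			"via fe80::1"
 *	RTA_OIF				"dev eth0"
 *	RTA_PRIORITY			"metric 1024"
 *	RTA_PREF			"pref medium"
 *	RTA_MULTIPATH			one "nexthop ..." line per sibling
 *	rtnl_put_cacheinfo()		"expires", e.g. for RA-learned routes
 */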
4696 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4698 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4699 struct net *net = arg->net;
4701 if (rt == net->ipv6.fib6_null_entry)
4704 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4705 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4707 /* user wants prefix routes only */
4708 if (rtm->rtm_flags & RTM_F_PREFIX &&
4709 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4710 /* success since this is not a prefix route */
4715 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4716 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4717 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4720 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4721 struct netlink_ext_ack *extack)
4723 struct net *net = sock_net(in_skb->sk);
4724 struct nlattr *tb[RTA_MAX+1];
4725 int err, iif = 0, oif = 0;
4726 struct fib6_info *from;
4727 struct dst_entry *dst;
4728 struct rt6_info *rt;
4729 struct sk_buff *skb;
4734 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4740 memset(&fl6, 0, sizeof(fl6));
4741 rtm = nlmsg_data(nlh);
4742 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4743 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4746 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4749 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4753 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4756 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4760 iif = nla_get_u32(tb[RTA_IIF]);
4763 oif = nla_get_u32(tb[RTA_OIF]);
4766 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4769 fl6.flowi6_uid = make_kuid(current_user_ns(),
4770 nla_get_u32(tb[RTA_UID]));
4772 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4775 struct net_device *dev;
4780 dev = dev_get_by_index_rcu(net, iif);
4787 fl6.flowi6_iif = iif;
4789 if (!ipv6_addr_any(&fl6.saddr))
4790 flags |= RT6_LOOKUP_F_HAS_SADDR;
4792 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4796 fl6.flowi6_oif = oif;
4798 dst = ip6_route_output(net, NULL, &fl6);
4802 rt = container_of(dst, struct rt6_info, dst);
4803 if (rt->dst.error) {
4804 err = rt->dst.error;
4809 if (rt == net->ipv6.ip6_null_entry) {
4810 err = rt->dst.error;
4815 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4822 skb_dst_set(skb, &rt->dst);
4825 from = rcu_dereference(rt->from);
4828 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4829 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4832 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4833 &fl6.saddr, iif, RTM_NEWROUTE,
4834 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4843 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4848 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4849 unsigned int nlm_flags)
4851 struct sk_buff *skb;
4852 struct net *net = info->nl_net;
4857 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4859 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4863 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4864 event, info->portid, seq, nlm_flags);
4866 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4867 WARN_ON(err == -EMSGSIZE);
4871 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4872 info->nlh, gfp_any());
4876 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4879 static int ip6_route_dev_notify(struct notifier_block *this,
4880 unsigned long event, void *ptr)
4882 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4883 struct net *net = dev_net(dev);
4885 if (!(dev->flags & IFF_LOOPBACK))
4888 if (event == NETDEV_REGISTER) {
4889 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4890 net->ipv6.ip6_null_entry->dst.dev = dev;
4891 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4892 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4893 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4894 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4895 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4896 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4898 } else if (event == NETDEV_UNREGISTER &&
4899 dev->reg_state != NETREG_UNREGISTERED) {
4900 /* NETDEV_UNREGISTER could be fired multiple times by
4901 * netdev_wait_allrefs(). Make sure we only call this once.
4903 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4904 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4905 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4906 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4917 #ifdef CONFIG_PROC_FS
4919 static const struct file_operations ipv6_route_proc_fops = {
4920 .open = ipv6_route_open,
4922 .llseek = seq_lseek,
4923 .release = seq_release_net,
4926 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4928 struct net *net = (struct net *)seq->private;
4929 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4930 net->ipv6.rt6_stats->fib_nodes,
4931 net->ipv6.rt6_stats->fib_route_nodes,
4932 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4933 net->ipv6.rt6_stats->fib_rt_entries,
4934 net->ipv6.rt6_stats->fib_rt_cache,
4935 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4936 net->ipv6.rt6_stats->fib_discarded_routes);
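/* The seven hexadecimal columns written above are read via
 * /proc/net/rt6_stats (registered in ip6_route_net_init_late() below) and
 * appear in this order:
 *
 *	fib_nodes  fib_route_nodes  fib_rt_alloc  fib_rt_entries
 *	fib_rt_cache  <dst entries in use>  fib_discarded_routes
 */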
4941 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4943 return single_open_net(inode, file, rt6_stats_seq_show);
4946 static const struct file_operations rt6_stats_seq_fops = {
4947 .open = rt6_stats_seq_open,
4949 .llseek = seq_lseek,
4950 .release = single_release_net,
4952 #endif /* CONFIG_PROC_FS */
4954 #ifdef CONFIG_SYSCTL
4957 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4958 void __user *buffer, size_t *lenp, loff_t *ppos)
4965 net = (struct net *)ctl->extra1;
4966 delay = net->ipv6.sysctl.flush_delay;
4967 proc_dointvec(ctl, write, buffer, lenp, ppos);
4968 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4972 struct ctl_table ipv6_route_table_template[] = {
4974 .procname = "flush",
4975 .data = &init_net.ipv6.sysctl.flush_delay,
4976 .maxlen = sizeof(int),
4978 .proc_handler = ipv6_sysctl_rtcache_flush
4981 .procname = "gc_thresh",
4982 .data = &ip6_dst_ops_template.gc_thresh,
4983 .maxlen = sizeof(int),
4985 .proc_handler = proc_dointvec,
4988 .procname = "max_size",
4989 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4990 .maxlen = sizeof(int),
4992 .proc_handler = proc_dointvec,
4995 .procname = "gc_min_interval",
4996 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4997 .maxlen = sizeof(int),
4999 .proc_handler = proc_dointvec_jiffies,
5002 .procname = "gc_timeout",
5003 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5004 .maxlen = sizeof(int),
5006 .proc_handler = proc_dointvec_jiffies,
5009 .procname = "gc_interval",
5010 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5011 .maxlen = sizeof(int),
5013 .proc_handler = proc_dointvec_jiffies,
5016 .procname = "gc_elasticity",
5017 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5018 .maxlen = sizeof(int),
5020 .proc_handler = proc_dointvec,
5023 .procname = "mtu_expires",
5024 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5025 .maxlen = sizeof(int),
5027 .proc_handler = proc_dointvec_jiffies,
5030 .procname = "min_adv_mss",
5031 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5032 .maxlen = sizeof(int),
5034 .proc_handler = proc_dointvec,
5037 .procname = "gc_min_interval_ms",
5038 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5039 .maxlen = sizeof(int),
5041 .proc_handler = proc_dointvec_ms_jiffies,
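/* These entries surface as /proc/sys/net/ipv6/route/* once the per-netns
 * copy built below is registered (elsewhere in the IPv6 sysctl setup), so
 * the garbage collection described at ip6_dst_gc() can be tuned at runtime,
 * for example (values are illustrative):
 *
 *	sysctl -w net.ipv6.route.max_size=16384
 *	sysctl -w net.ipv6.route.gc_elasticity=4
 *	echo 1 > /proc/sys/net/ipv6/route/flush	triggers ipv6_sysctl_rtcache_flush()
 */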
5046 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5048 struct ctl_table *table;
5050 table = kmemdup(ipv6_route_table_template,
5051 sizeof(ipv6_route_table_template),
5055 table[0].data = &net->ipv6.sysctl.flush_delay;
5056 table[0].extra1 = net;
5057 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5058 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5059 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5060 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5061 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5062 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5063 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5064 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5065 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5067 /* Don't export sysctls to unprivileged users */
5068 if (net->user_ns != &init_user_ns)
5069 table[0].procname = NULL;
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
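/*
 * Each namespace also carries its own inet_peer base; long-lived
 * per-peer state (for example ICMPv6 rate-limiting tokens) is looked up
 * via net->ipv6.peers, so tearing down a namespace only needs to
 * invalidate its own tree.
 */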
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
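/*
 * Device notifier that keeps the special null/prohibit/blackhole entries
 * attached to a namespace's loopback device as it comes and goes. The
 * priority sits below ADDRCONF_NOTIFY_PRIORITY so that addrconf sees the
 * device event before this handler runs.
 */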
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
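/*
 * Module initialisation. Ordering matters here: the dst kmem cache must
 * exist before the pernet subsystems are registered, because
 * ip6_route_net_init() copies ip6_dst_ops_template (including the cache
 * pointer) into each namespace; the fib and policy-rule layers come
 * next, and only then are the RTM_*ROUTE netlink handlers and the
 * netdevice notifier exposed. Failures unwind through the labels at the
 * end of the function.
 */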
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;
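/*
 * Error unwinding: each label below undoes the registrations made up to
 * the corresponding point in the success path, in reverse order, then
 * jumps back to "out" to return the error code.
 */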
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}