2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
67 #include <linux/uaccess.h>
70 #include <linux/sysctl.h>
73 static int ip6_rt_type_to_error(u8 fib6_type);
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
81 RT6_NUD_FAIL_HARD = -3,
82 RT6_NUD_FAIL_PROBE = -2,
83 RT6_NUD_FAIL_DO_RR = -1,
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(struct dst_ops *ops);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int ip6_pkt_prohibit(struct sk_buff *skb);
99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void ip6_link_failure(struct sk_buff *skb);
101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 struct sk_buff *skb, u32 mtu);
103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109 struct fib6_info *rt, struct dst_entry *dst,
110 struct in6_addr *dest, struct in6_addr *src,
111 int iif, int type, u32 portid, u32 seq,
113 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
114 struct in6_addr *daddr,
115 struct in6_addr *saddr);
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev,
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124 const struct in6_addr *prefix, int prefixlen,
125 const struct in6_addr *gwaddr,
126 struct net_device *dev);
/* Per-CPU lists tracking "uncached" rt6_info dst entries (clones that
 * live outside the fib tree).  NOTE(review): this chunk appears to be
 * missing lines (struct members, braces); code left byte-identical.
 */
129 struct uncached_list {
131 struct list_head head;
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Link @rt onto the current CPU's uncached list under its BH-safe lock. */
136 void rt6_uncached_list_add(struct rt6_info *rt)
138 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
140 rt->rt6i_uncached_list = ul;
142 spin_lock_bh(&ul->lock);
143 list_add_tail(&rt->rt6i_uncached, &ul->head);
144 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the list it was added on and decrement the per-netns
 * uncached-route counter.  No-op when the entry was never listed.
 */
147 void rt6_uncached_list_del(struct rt6_info *rt)
149 if (!list_empty(&rt->rt6i_uncached)) {
150 struct uncached_list *ul = rt->rt6i_uncached_list;
151 struct net *net = dev_net(rt->dst.dev);
153 spin_lock_bh(&ul->lock);
154 list_del(&rt->rt6i_uncached);
155 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156 spin_unlock_bh(&ul->lock);
/* Walk every CPU's uncached list and repoint entries referencing @dev
 * at the netns loopback device (idev and dst.dev references moved over)
 * so the device can be released.
 */
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
162 struct net_device *loopback_dev = net->loopback_dev;
/* nothing to do when the loopback device itself is going away */
165 if (dev == loopback_dev)
168 for_each_possible_cpu(cpu) {
169 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
172 spin_lock_bh(&ul->lock);
173 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174 struct inet6_dev *rt_idev = rt->rt6i_idev;
175 struct net_device *rt_dev = rt->dst.dev;
177 if (rt_idev->dev == dev) {
178 rt->rt6i_idev = in6_dev_get(loopback_dev);
179 in6_dev_put(rt_idev);
183 rt->dst.dev = loopback_dev;
184 dev_hold(rt->dst.dev);
188 spin_unlock_bh(&ul->lock);
/* Pick the neighbour-table key: the gateway address when non-any,
 * otherwise the packet's destination from the skb header.
 */
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
196 if (!ipv6_addr_any(p))
197 return (const void *) p;
199 return &ipv6_hdr(skb)->daddr;
/* Look up (and on miss create) the ndisc neighbour entry for the chosen
 * address on @dev; returns NULL instead of an ERR_PTR on failure.
 */
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204 struct net_device *dev,
210 daddr = choose_neigh_daddr(gw, skb, daddr);
211 n = __ipv6_neigh_lookup(dev, daddr);
215 n = neigh_create(&nd_tbl, daddr, dev);
216 return IS_ERR(n) ? NULL : n;
/* dst_ops->neigh_lookup hook: dispatch via the route's gateway. */
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
223 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
225 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
/* dst_ops->confirm_neigh hook: confirm next-hop reachability, skipping
 * devices without neighbour resolution and multicast targets.
 */
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
230 struct net_device *dev = dst->dev;
231 struct rt6_info *rt = (struct rt6_info *)dst;
233 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
236 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
238 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
240 __ipv6_confirm_neigh(dev, daddr);
/* dst_ops vtable used for regular IPv6 routes. */
243 static struct dst_ops ip6_dst_ops_template = {
247 .check = ip6_dst_check,
248 .default_advmss = ip6_default_advmss,
250 .cow_metrics = dst_cow_metrics_generic,
251 .destroy = ip6_dst_destroy,
252 .ifdown = ip6_dst_ifdown,
253 .negative_advice = ip6_negative_advice,
254 .link_failure = ip6_link_failure,
255 .update_pmtu = ip6_rt_update_pmtu,
256 .redirect = rt6_do_redirect,
257 .local_out = __ip6_local_out,
258 .neigh_lookup = ip6_dst_neigh_lookup,
259 .confirm_neigh = ip6_confirm_neigh,
/* MTU for blackhole dsts: the raw RTAX_MTU metric if set, otherwise the
 * device MTU.
 */
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
264 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
266 return mtu ? : dst->dev->mtu;
/* Blackhole routes deliberately ignore PMTU updates and redirects
 * (empty hooks).
 */
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270 struct sk_buff *skb, u32 mtu)
274 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops vtable used for blackhole dst clones. */
279 static struct dst_ops ip6_dst_blackhole_ops = {
281 .destroy = ip6_dst_destroy,
282 .check = ip6_dst_check,
283 .mtu = ip6_blackhole_mtu,
284 .default_advmss = ip6_default_advmss,
285 .update_pmtu = ip6_rt_blackhole_update_pmtu,
286 .redirect = ip6_rt_blackhole_redirect,
287 .cow_metrics = dst_cow_metrics_generic,
288 .neigh_lookup = ip6_dst_neigh_lookup,
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the per-netns fib6_null_entry: a kernel-owned REJECT
 * route of type RTN_UNREACHABLE with the worst possible metric.
 */
295 static const struct fib6_info fib6_null_entry_template = {
296 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
297 .fib6_protocol = RTPROT_KERNEL,
298 .fib6_metric = ~(u32)0,
299 .fib6_ref = ATOMIC_INIT(1),
300 .fib6_type = RTN_UNREACHABLE,
301 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
/* dst-level counterpart of the null entry: discards packets and reports
 * -ENETUNREACH.
 */
304 static const struct rt6_info ip6_null_entry_template = {
306 .__refcnt = ATOMIC_INIT(1),
308 .obsolete = DST_OBSOLETE_FORCE_CHK,
309 .error = -ENETUNREACH,
310 .input = ip6_pkt_discard,
311 .output = ip6_pkt_discard_out,
313 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Policy-routing only: "prohibit" (administratively rejected) and
 * silent blackhole dst templates.
 */
318 static const struct rt6_info ip6_prohibit_entry_template = {
320 .__refcnt = ATOMIC_INIT(1),
322 .obsolete = DST_OBSOLETE_FORCE_CHK,
324 .input = ip6_pkt_prohibit,
325 .output = ip6_pkt_prohibit_out,
327 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
330 static const struct rt6_info ip6_blk_hole_entry_template = {
332 .__refcnt = ATOMIC_INIT(1),
334 .obsolete = DST_OBSOLETE_FORCE_CHK,
336 .input = dst_discard,
337 .output = dst_discard_out,
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
/* Zero everything after the embedded dst_entry and initialise the
 * uncached-list linkage of a freshly allocated rt6_info.
 */
344 static void rt6_info_init(struct rt6_info *rt)
346 struct dst_entry *dst = &rt->dst;
348 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349 INIT_LIST_HEAD(&rt->rt6i_uncached);
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
356 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357 1, DST_OBSOLETE_FORCE_CHK, flags);
361 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
366 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops->destroy: release metrics, uncached-list membership, the
 * inet6_dev reference and the fib6_info this dst was derived from.
 */
368 static void ip6_dst_destroy(struct dst_entry *dst)
370 struct rt6_info *rt = (struct rt6_info *)dst;
371 struct fib6_info *from;
372 struct inet6_dev *idev;
374 ip_dst_metrics_put(dst);
375 rt6_uncached_list_del(rt);
377 idev = rt->rt6i_idev;
379 rt->rt6i_idev = NULL;
384 from = rcu_dereference(rt->from);
385 rcu_assign_pointer(rt->from, NULL);
386 fib6_info_release(from);
/* dst_ops->ifdown: migrate the idev reference to the netns loopback
 * device when the underlying device goes down.
 */
390 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
393 struct rt6_info *rt = (struct rt6_info *)dst;
394 struct inet6_dev *idev = rt->rt6i_idev;
395 struct net_device *loopback_dev =
396 dev_net(dev)->loopback_dev;
398 if (idev && idev->dev != loopback_dev) {
399 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
401 rt->rt6i_idev = loopback_idev;
/* True when an RTF_EXPIRES dst's own expiry time has passed. */
407 static bool __rt6_check_expired(const struct rt6_info *rt)
409 if (rt->rt6i_flags & RTF_EXPIRES)
410 return time_after(jiffies, rt->dst.expires)
/* Like __rt6_check_expired() but also considers whether the fib6_info
 * this dst was derived from has itself expired.
 */
415 static bool rt6_check_expired(const struct rt6_info *rt)
417 struct fib6_info *from;
419 from = rcu_dereference(rt->from);
421 if (rt->rt6i_flags & RTF_EXPIRES) {
422 if (time_after(jiffies, rt->dst.expires))
425 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
426 fib6_check_expired(from);
/* ECMP selection: hash the flow (reusing a hash already computed for
 * ICMPv6 errors when non-zero) and walk the sibling routes until the
 * nexthop whose upper bound covers the hash also scores as usable.
 */
431 struct fib6_info *fib6_multipath_select(const struct net *net,
432 struct fib6_info *match,
433 struct flowi6 *fl6, int oif,
434 const struct sk_buff *skb,
437 struct fib6_info *sibling, *next_sibling;
439 /* We might have already computed the hash for ICMPv6 errors. In such
440 * case it will always be non-zero. Otherwise now is the time to do it.
443 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
445 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
448 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
450 const struct fib6_nh *nh = &sibling->fib6_nh;
453 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
454 if (fl6->mp_hash > nh_upper_bound)
456 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
466 * Route lookup. rcu_read_lock() should be held.
/* Among routes sharing a prefix, pick one satisfying the oif/saddr
 * constraints; with strict interface matching, fall back to the netns
 * null entry when nothing matches.
 */
469 static inline struct fib6_info *rt6_device_match(struct net *net,
470 struct fib6_info *rt,
471 const struct in6_addr *saddr,
475 struct fib6_info *sprt;
477 if (!oif && ipv6_addr_any(saddr) &&
478 !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
481 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
482 const struct net_device *dev = sprt->fib6_nh.fib_nh_dev;
484 if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
488 if (dev->ifindex == oif)
491 if (ipv6_chk_addr(net, saddr, dev,
492 flags & RT6_LOOKUP_F_IFACE))
497 if (oif && flags & RT6_LOOKUP_F_IFACE)
498 return net->ipv6.fib6_null_entry;
500 return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
503 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for sending a neighbour-solicitation probe. */
504 struct __rt6_probe_work {
505 struct work_struct work;
506 struct in6_addr target;
507 struct net_device *dev;
/* Workqueue callback: send the NS to the target's solicited-node
 * multicast address.
 */
510 static void rt6_probe_deferred(struct work_struct *w)
512 struct in6_addr mcaddr;
513 struct __rt6_probe_work *work =
514 container_of(w, struct __rt6_probe_work, work);
516 addrconf_addr_solict_mult(&work->target, &mcaddr);
517 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
/* Router Reachability Probing: schedule a rate-limited NS probe toward
 * a gateway whose neighbour entry is not currently NUD_VALID.
 */
522 static void rt6_probe(struct fib6_nh *fib6_nh)
524 struct __rt6_probe_work *work = NULL;
525 const struct in6_addr *nh_gw;
526 struct neighbour *neigh;
527 struct net_device *dev;
528 struct inet6_dev *idev;
531 * Okay, this does not seem to be appropriate
532 * for now, however, we need to check if it
533 * is really so; aka Router Reachability Probing.
535 * Router Reachability Probe MUST be rate-limited
536 * to no more than one per minute.
538 if (fib6_nh->fib_nh_gw_family)
541 nh_gw = &fib6_nh->fib_nh_gw6;
542 dev = fib6_nh->fib_nh_dev;
544 idev = __in6_dev_get(dev);
545 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
547 if (neigh->nud_state & NUD_VALID)
550 write_lock(&neigh->lock);
551 if (!(neigh->nud_state & NUD_VALID) &&
553 neigh->updated + idev->cnf.rtr_probe_interval)) {
554 work = kmalloc(sizeof(*work), GFP_ATOMIC);
556 __neigh_set_probe_once(neigh);
558 write_unlock(&neigh->lock);
559 } else if (time_after(jiffies, fib6_nh->last_probe +
560 idev->cnf.rtr_probe_interval)) {
561 work = kmalloc(sizeof(*work), GFP_ATOMIC);
565 fib6_nh->last_probe = jiffies;
566 INIT_WORK(&work->work, rt6_probe_deferred);
567 work->target = *nh_gw;
570 schedule_work(&work->work);
574 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF stub: probing compiled out. */
577 static inline void rt6_probe(struct fib6_nh *fib6_nh)
583 * Default Router Selection (RFC 2461 6.3.6)
/* Classify nexthop neighbour reachability for default-router
 * selection; result feeds into rt6_score_route().
 */
585 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
587 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
588 struct neighbour *neigh;
591 neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
592 &fib6_nh->fib_nh_gw6);
594 read_lock(&neigh->lock);
595 if (neigh->nud_state & NUD_VALID)
596 ret = RT6_NUD_SUCCEED;
597 #ifdef CONFIG_IPV6_ROUTER_PREF
598 else if (!(neigh->nud_state & NUD_FAILED))
599 ret = RT6_NUD_SUCCEED;
601 ret = RT6_NUD_FAIL_PROBE;
603 read_unlock(&neigh->lock);
605 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
606 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
608 rcu_read_unlock_bh();
/* Score a nexthop: interface match, route preference (with
 * CONFIG_IPV6_ROUTER_PREF) and neighbour reachability.  Negative
 * values are RT6_NUD_FAIL_* codes.
 */
613 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
618 if (!oif || nh->fib_nh_dev->ifindex == oif)
621 if (!m && (strict & RT6_LOOKUP_F_IFACE))
622 return RT6_NUD_FAIL_HARD;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
626 if ((strict & RT6_LOOKUP_F_REACHABLE) &&
627 !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
628 int n = rt6_check_neigh(nh);
/* Update best-match bookkeeping (*mpri, *do_rr) for one nexthop;
 * skips dead and (unless tolerated) link-down nexthops.
 */
635 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
636 int oif, int strict, int *mpri, bool *do_rr)
638 bool match_do_rr = false;
642 if (nh->fib_nh_flags & RTNH_F_DEAD)
645 if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
646 nh->fib_nh_flags & RTNH_F_LINKDOWN &&
647 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
650 m = rt6_score_route(nh, fib6_flags, oif, strict);
651 if (m == RT6_NUD_FAIL_DO_RR) {
653 m = 0; /* lowest valid score */
654 } else if (m == RT6_NUD_FAIL_HARD) {
658 if (strict & RT6_LOOKUP_F_REACHABLE)
661 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
663 *do_rr = match_do_rr;
/* Walk a leaf chain from @rt_start (stopping at @nomatch) scoring
 * routes at @metric; routes at a different metric are remembered via
 * *cont for a later pass.
 */
671 static void __find_rr_leaf(struct fib6_info *rt_start,
672 struct fib6_info *nomatch, u32 metric,
673 struct fib6_info **match, struct fib6_info **cont,
674 int oif, int strict, bool *do_rr, int *mpri)
676 struct fib6_info *rt;
680 rt = rcu_dereference(rt->fib6_next)) {
683 if (cont && rt->fib6_metric != metric) {
688 if (fib6_check_expired(rt))
692 if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr))
/* Round-robin aware search: rr_head to end of chain, then leaf up to
 * rr_head, finally any deferred higher-metric continuation.
 */
697 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
698 struct fib6_info *leaf,
699 struct fib6_info *rr_head,
700 u32 metric, int oif, int strict,
703 struct fib6_info *match = NULL, *cont = NULL;
706 __find_rr_leaf(rr_head, NULL, metric, &match, &cont,
707 oif, strict, do_rr, &mpri);
709 __find_rr_leaf(leaf, rr_head, metric, &match, &cont,
710 oif, strict, do_rr, &mpri);
715 __find_rr_leaf(cont, NULL, metric, &match, NULL,
716 oif, strict, do_rr, &mpri);
/* Select the route to use from fib6_node @fn, advancing the rr_ptr
 * (round-robin) when no route is (probably) reachable.
 */
721 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
724 struct fib6_info *leaf = rcu_dereference(fn->leaf);
725 struct fib6_info *match, *rt0;
729 if (!leaf || leaf == net->ipv6.fib6_null_entry)
730 return net->ipv6.fib6_null_entry;
732 rt0 = rcu_dereference(fn->rr_ptr);
736 /* Double check to make sure fn is not an intermediate node
737 * and fn->leaf does not points to its child's leaf
738 * (This might happen if all routes under fn are deleted from
739 * the tree and fib6_repair_tree() is called on the node.)
741 key_plen = rt0->fib6_dst.plen;
742 #ifdef CONFIG_IPV6_SUBTREES
743 if (rt0->fib6_src.plen)
744 key_plen = rt0->fib6_src.plen;
746 if (fn->fn_bit != key_plen)
747 return net->ipv6.fib6_null_entry;
749 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
753 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
755 /* no entries matched; do round-robin */
756 if (!next || next->fib6_metric != rt0->fib6_metric)
760 spin_lock_bh(&leaf->fib6_table->tb6_lock);
761 /* make sure next is not being deleted from the tree */
763 rcu_assign_pointer(fn->rr_ptr, next);
764 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
768 return match ? match : net->ipv6.fib6_null_entry;
/* True when the route has a gateway nexthop or carries RTF_NONEXTHOP. */
771 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
773 return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_gw_family;
776 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle a Route Information option from a Router Advertisement:
 * validate length/prefix_len, then add, update or expire the
 * corresponding RTF_ROUTEINFO route with the advertised preference
 * and lifetime.
 */
777 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
778 const struct in6_addr *gwaddr)
780 struct net *net = dev_net(dev);
781 struct route_info *rinfo = (struct route_info *) opt;
782 struct in6_addr prefix_buf, *prefix;
784 unsigned long lifetime;
785 struct fib6_info *rt;
787 if (len < sizeof(struct route_info)) {
791 /* Sanity check for prefix_len and length */
792 if (rinfo->length > 3) {
794 } else if (rinfo->prefix_len > 128) {
796 } else if (rinfo->prefix_len > 64) {
797 if (rinfo->length < 2) {
800 } else if (rinfo->prefix_len > 0) {
801 if (rinfo->length < 1) {
806 pref = rinfo->route_pref;
807 if (pref == ICMPV6_ROUTER_PREF_INVALID)
810 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
812 if (rinfo->length == 3)
813 prefix = (struct in6_addr *)rinfo->prefix;
815 /* this function is safe */
816 ipv6_addr_prefix(&prefix_buf,
817 (struct in6_addr *)rinfo->prefix,
819 prefix = &prefix_buf;
/* prefix_len == 0 means the option describes the default route */
822 if (rinfo->prefix_len == 0)
823 rt = rt6_get_dflt_router(net, gwaddr, dev);
825 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
828 if (rt && !lifetime) {
834 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
837 rt->fib6_flags = RTF_ROUTEINFO |
838 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
841 if (!addrconf_finite_timeout(lifetime))
842 fib6_clean_expires(rt);
844 fib6_set_expires(rt, jiffies + HZ * lifetime);
846 fib6_info_release(rt);
853 * Misc support functions
856 /* called with rcu_lock held */
/* Resolve the dst->dev to use for copies of local/anycast routes: the
 * l3mdev master when enslaved, the netns loopback otherwise.
 */
857 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
859 struct net_device *dev = rt->fib6_nh.fib_nh_dev;
861 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
862 /* for copies of local routes, dst->dev needs to be the
863 * device if it is a master device, the master device if
864 * device is enslaved, and the loopback as the default
866 if (netif_is_l3_slave(dev) &&
867 !rt6_need_strict(&rt->fib6_dst.addr))
868 dev = l3mdev_master_dev_rcu(dev);
869 else if (!netif_is_l3_master(dev))
870 dev = dev_net(dev)->loopback_dev;
871 /* last case is netif_is_l3_master(dev) is true in which
872 * case we want dev returned to be dev
/* Map RTN_* route types to the errno reported by rejecting routes. */
879 static const int fib6_prop[RTN_MAX + 1] = {
886 [RTN_BLACKHOLE] = -EINVAL,
887 [RTN_UNREACHABLE] = -EHOSTUNREACH,
888 [RTN_PROHIBIT] = -EACCES,
889 [RTN_THROW] = -EAGAIN,
891 [RTN_XRESOLVE] = -EINVAL,
894 static int ip6_rt_type_to_error(u8 fib6_type)
896 return fib6_prop[fib6_type];
/* Translate fib6_info attributes into DST_* allocation flags. */
899 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
901 unsigned short flags = 0;
904 flags |= DST_NOCOUNT;
905 if (rt->dst_nopolicy)
906 flags |= DST_NOPOLICY;
/* Set input/output handlers and dst.error for a REJECT-type route
 * according to its RTN_* type.
 */
913 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
915 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
917 switch (ort->fib6_type) {
919 rt->dst.output = dst_discard_out;
920 rt->dst.input = dst_discard;
923 rt->dst.output = ip6_pkt_prohibit_out;
924 rt->dst.input = ip6_pkt_prohibit;
927 case RTN_UNREACHABLE:
929 rt->dst.output = ip6_pkt_discard_out;
930 rt->dst.input = ip6_pkt_discard;
/* Initialise the dst of a rt6_info clone from its fib6_info template:
 * choose input/output handlers by route type and attach any lwtunnel
 * state.
 */
935 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
937 if (ort->fib6_flags & RTF_REJECT) {
938 ip6_rt_init_dst_reject(rt, ort);
943 rt->dst.output = ip6_output;
945 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
946 rt->dst.input = ip6_input;
947 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
948 rt->dst.input = ip6_mc_input;
950 rt->dst.input = ip6_forward;
953 if (ort->fib6_nh.fib_nh_lws) {
954 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
955 lwtunnel_set_redirect(&rt->dst);
958 rt->dst.lastuse = jiffies;
961 /* Caller must already hold reference to @from */
962 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
964 rt->rt6i_flags &= ~RTF_EXPIRES;
965 rcu_assign_pointer(rt->from, from);
966 ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
969 /* Caller must already hold reference to @ort */
/* Copy addressing, flags, gateway and idev reference from @ort into
 * the freshly allocated clone @rt.
 */
970 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
972 struct net_device *dev = fib6_info_nh_dev(ort);
974 ip6_rt_init_dst(rt, ort);
976 rt->rt6i_dst = ort->fib6_dst;
977 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
978 rt->rt6i_flags = ort->fib6_flags;
979 if (ort->fib6_nh.fib_nh_gw_family) {
980 rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
981 rt->rt6i_flags |= RTF_GATEWAY;
983 rt6_set_from(rt, ort);
984 #ifdef CONFIG_IPV6_SUBTREES
985 rt->rt6i_src = ort->fib6_src;
/* Step back up the fib trie from @fn, descending into a parent's
 * source subtree when present (CONFIG_IPV6_SUBTREES); returns the next
 * node carrying route info, or stops at the tree root.
 */
989 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
990 struct in6_addr *saddr)
992 struct fib6_node *pn, *sn;
994 if (fn->fn_flags & RTN_TL_ROOT)
996 pn = rcu_dereference(fn->parent);
997 sn = FIB6_SUBTREE(pn);
999 fn = fib6_node_lookup(sn, NULL, saddr);
1002 if (fn->fn_flags & RTN_RTINFO)
/* Take a reference on *prt if the dst is still live; otherwise
 * substitute the netns null entry.
 */
1007 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1009 struct rt6_info *rt = *prt;
1011 if (dst_hold_safe(&rt->dst))
1014 rt = net->ipv6.ip6_null_entry;
1023 /* called with rcu_lock held */
/* Clone a fib6_info into a standalone (non-cached) rt6_info dst;
 * falls back to the null entry on allocation failure.
 */
1024 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1026 unsigned short flags = fib6_info_dst_flags(rt);
1027 struct net_device *dev = rt->fib6_nh.fib_nh_dev;
1028 struct rt6_info *nrt;
1030 if (!fib6_info_hold_safe(rt))
1033 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1035 fib6_info_release(rt);
1039 ip6_rt_copy_init(nrt, rt);
1043 nrt = dev_net(dev)->ipv6.ip6_null_entry;
1044 dst_hold(&nrt->dst);
/* Per-table lookup backend for fib6_rule_lookup(): walk the trie,
 * apply device/multipath matching, then return a cached exception
 * route if one exists, otherwise a fresh clone.
 */
1048 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1049 struct fib6_table *table,
1051 const struct sk_buff *skb,
1054 struct fib6_info *f6i;
1055 struct fib6_node *fn;
1056 struct rt6_info *rt;
1058 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1059 flags &= ~RT6_LOOKUP_F_IFACE;
1062 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1064 f6i = rcu_dereference(fn->leaf);
1066 f6i = net->ipv6.fib6_null_entry;
1068 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1069 fl6->flowi6_oif, flags);
/* no match at this node: backtrack up the trie and retry */
1071 if (f6i == net->ipv6.fib6_null_entry) {
1072 fn = fib6_backtrack(fn, &fl6->saddr);
1076 rt = net->ipv6.ip6_null_entry;
1081 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1082 f6i = fib6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb,
1084 /* Search through exception table */
1085 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1087 if (ip6_hold_safe(net, &rt))
1088 dst_use_noref(&rt->dst, jiffies);
1090 rt = ip6_create_rt_rcu(f6i);
1094 trace_fib6_table_lookup(net, f6i, table, fl6);
/* Public lookup entry point: dispatch through the policy rules. */
1101 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1102 const struct sk_buff *skb, int flags)
1104 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1106 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by daddr/saddr/oif; returns NULL for error dsts. */
1108 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1109 const struct in6_addr *saddr, int oif,
1110 const struct sk_buff *skb, int strict)
1112 struct flowi6 fl6 = {
1116 struct dst_entry *dst;
1117 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1120 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1121 flags |= RT6_LOOKUP_F_HAS_SADDR;
1124 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1125 if (dst->error == 0)
1126 return (struct rt6_info *) dst;
1132 EXPORT_SYMBOL(rt6_lookup);
1134 /* ip6_ins_rt is called with FREE table->tb6_lock.
1135 * It takes new route entry, the addition fails by any reason the
1136 * route is released.
1137 * Caller must hold dst before calling it.
/* Insert @rt into its fib6 table under the table spinlock. */
1140 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1141 struct netlink_ext_ack *extack)
1144 struct fib6_table *table;
1146 table = rt->fib6_table;
1147 spin_lock_bh(&table->tb6_lock);
1148 err = fib6_add(&table->tb6_root, rt, info, extack);
1149 spin_unlock_bh(&table->tb6_lock);
/* Wrapper for callers without netlink context. */
1154 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1156 struct nl_info info = { .nl_net = net, };
1158 return __ip6_ins_rt(rt, &info, NULL);
/* Build an RTF_CACHE clone (a /128 host route) of @ort for the given
 * destination (and, with subtrees, source) address.
 */
1161 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1162 const struct in6_addr *daddr,
1163 const struct in6_addr *saddr)
1165 struct net_device *dev;
1166 struct rt6_info *rt;
1172 if (!fib6_info_hold_safe(ort))
1175 dev = ip6_rt_get_dev_rcu(ort);
1176 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1178 fib6_info_release(ort);
1182 ip6_rt_copy_init(rt, ort);
1183 rt->rt6i_flags |= RTF_CACHE;
1184 rt->dst.flags |= DST_HOST;
1185 rt->rt6i_dst.addr = *daddr;
1186 rt->rt6i_dst.plen = 128;
1188 if (!rt6_is_gw_or_nonexthop(ort)) {
1189 if (ort->fib6_dst.plen != 128 &&
1190 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1191 rt->rt6i_flags |= RTF_ANYCAST;
1192 #ifdef CONFIG_IPV6_SUBTREES
1193 if (rt->rt6i_src.plen && saddr) {
1194 rt->rt6i_src.addr = *saddr;
1195 rt->rt6i_src.plen = 128;
/* Allocate the per-CPU (RTF_PCPU) clone of @rt. */
1203 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1205 unsigned short flags = fib6_info_dst_flags(rt);
1206 struct net_device *dev;
1207 struct rt6_info *pcpu_rt;
1209 if (!fib6_info_hold_safe(rt))
1213 dev = ip6_rt_get_dev_rcu(rt);
1214 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1217 fib6_info_release(rt);
1220 ip6_rt_copy_init(pcpu_rt, rt);
1221 pcpu_rt->rt6i_flags |= RTF_PCPU;
1225 /* It should be called with rcu_read_lock() acquired */
/* Fetch this CPU's cached clone of @rt, taking a safe reference. */
1226 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1228 struct rt6_info *pcpu_rt, **p;
1230 p = this_cpu_ptr(rt->rt6i_pcpu);
1234 ip6_hold_safe(NULL, &pcpu_rt);
/* Create and publish this CPU's clone via cmpxchg; falls back to the
 * null entry when allocation fails.
 */
1239 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1240 struct fib6_info *rt)
1242 struct rt6_info *pcpu_rt, *prev, **p;
1244 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1246 dst_hold(&net->ipv6.ip6_null_entry->dst);
1247 return net->ipv6.ip6_null_entry;
1250 dst_hold(&pcpu_rt->dst);
1251 p = this_cpu_ptr(rt->rt6i_pcpu);
1252 prev = cmpxchg(p, NULL, pcpu_rt);
1258 /* exception hash table implementation
1260 static DEFINE_SPINLOCK(rt6_exception_lock);
1262 /* Remove rt6_ex from hash table and free the memory
1263 * Caller must hold rt6_exception_lock
1265 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1266 struct rt6_exception *rt6_ex)
1268 struct fib6_info *from;
1271 if (!bucket || !rt6_ex)
1274 net = dev_net(rt6_ex->rt6i->dst.dev);
1275 net->ipv6.rt6_stats->fib_rt_cache--;
1277 /* purge completely the exception to allow releasing the held resources:
1278 * some [sk] cache may keep the dst around for unlimited time
1280 from = rcu_dereference_protected(rt6_ex->rt6i->from,
1281 lockdep_is_held(&rt6_exception_lock));
1282 rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
1283 fib6_info_release(from);
1284 dst_dev_put(&rt6_ex->rt6i->dst);
1286 hlist_del_rcu(&rt6_ex->hlist);
1287 dst_release(&rt6_ex->rt6i->dst);
1288 kfree_rcu(rt6_ex, rcu);
1289 WARN_ON_ONCE(!bucket->depth);
1293 /* Remove oldest rt6_ex in bucket and free the memory
1294 * Caller must hold rt6_exception_lock
1296 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1298 struct rt6_exception *rt6_ex, *oldest = NULL;
1303 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1304 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1307 rt6_remove_exception(bucket, oldest);
/* Hash (daddr[, saddr]) to an exception bucket index using a
 * lazily initialised random seed.
 */
1310 static u32 rt6_exception_hash(const struct in6_addr *dst,
1311 const struct in6_addr *src)
1313 static u32 seed __read_mostly;
1316 net_get_random_once(&seed, sizeof(seed));
1317 val = jhash(dst, sizeof(*dst), seed);
1319 #ifdef CONFIG_IPV6_SUBTREES
1321 val = jhash(src, sizeof(*src), val);
1323 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1326 /* Helper function to find the cached rt in the hash table
1327 * and update bucket pointer to point to the bucket for this
1328 * (daddr, saddr) pair
1329 * Caller must hold rt6_exception_lock
1331 static struct rt6_exception *
1332 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1333 const struct in6_addr *daddr,
1334 const struct in6_addr *saddr)
1336 struct rt6_exception *rt6_ex;
1339 if (!(*bucket) || !daddr)
1342 hval = rt6_exception_hash(daddr, saddr);
1345 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1346 struct rt6_info *rt6 = rt6_ex->rt6i;
1347 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1349 #ifdef CONFIG_IPV6_SUBTREES
1350 if (matched && saddr)
1351 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1359 /* Helper function to find the cached rt in the hash table
1360 * and update bucket pointer to point to the bucket for this
1361 * (daddr, saddr) pair
1362 * Caller must hold rcu_read_lock()
1364 static struct rt6_exception *
1365 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1366 const struct in6_addr *daddr,
1367 const struct in6_addr *saddr)
1369 struct rt6_exception *rt6_ex;
1372 WARN_ON_ONCE(!rcu_read_lock_held());
1374 if (!(*bucket) || !daddr)
1377 hval = rt6_exception_hash(daddr, saddr);
1380 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1381 struct rt6_info *rt6 = rt6_ex->rt6i;
1382 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1384 #ifdef CONFIG_IPV6_SUBTREES
1385 if (matched && saddr)
1386 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
/* Effective MTU of a fib6_info: its stored PMTU when set, otherwise the
 * device's IPv6 MTU; capped at IP6_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */
1394 static unsigned int fib6_mtu(const struct fib6_info *rt)
1398 if (rt->fib6_pmtu) {
1399 mtu = rt->fib6_pmtu;
1401 struct net_device *dev = fib6_info_nh_dev(rt);
1402 struct inet6_dev *idev;
1405 idev = __in6_dev_get(dev);
1406 mtu = idev->cnf.mtu6;
1410 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1412 return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
/* Insert @nrt as a cached exception route hanging off its parent fib6
 * entry @ort.  Serialized by rt6_exception_lock:
 *  - bails out if the bucket list was already flushed (entry dying),
 *  - allocates the bucket array on first use,
 *  - replaces any existing exception with the same dst/src key,
 *  - evicts the oldest entry when a bucket exceeds FIB6_MAX_DEPTH.
 * On success the table's fn_sernum is bumped and GC kicked so that all
 * previously cached dsts get invalidated.
 */
1415 static int rt6_insert_exception(struct rt6_info *nrt,
1416 struct fib6_info *ort)
1418 struct net *net = dev_net(nrt->dst.dev);
1419 struct rt6_exception_bucket *bucket;
1420 struct in6_addr *src_key = NULL;
1421 struct rt6_exception *rt6_ex;
1424 spin_lock_bh(&rt6_exception_lock);
/* rt6_flush_exceptions() set this: do not recreate the bucket list */
1426 if (ort->exception_bucket_flushed) {
1431 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1432 lockdep_is_held(&rt6_exception_lock));
/* first exception on this entry: allocate the hash bucket array */
1434 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1440 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1443 #ifdef CONFIG_IPV6_SUBTREES
1444 /* rt6i_src.plen != 0 indicates ort is in subtree
1445 * and exception table is indexed by a hash of
1446 * both rt6i_dst and rt6i_src.
1447 * Otherwise, the exception table is indexed by
1448 * a hash of only rt6i_dst.
1450 if (ort->fib6_src.plen)
1451 src_key = &nrt->rt6i_src.addr;
1453 /* rt6_mtu_change() might lower mtu on ort.
1454 * Only insert this exception route if its mtu
1455 * is less than ort's mtu value.
1457 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
/* drop any stale exception with the same key before inserting */
1462 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1465 rt6_remove_exception(bucket, rt6_ex);
1467 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1473 rt6_ex->stamp = jiffies;
1474 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1476 net->ipv6.rt6_stats->fib_rt_cache++;
1478 if (bucket->depth > FIB6_MAX_DEPTH)
1479 rt6_exception_remove_oldest(bucket);
1482 spin_unlock_bh(&rt6_exception_lock);
1484 /* Update fn->fn_sernum to invalidate all cached dst */
1486 spin_lock_bh(&ort->fib6_table->tb6_lock);
1487 fib6_update_sernum(net, ort);
1488 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1489 fib6_force_start_gc(net);
/* Remove every cached exception route attached to @rt.  Marks the
 * entry flushed first (under rt6_exception_lock) so that a concurrent
 * rt6_insert_exception() cannot recreate the bucket list afterwards.
 */
1495 void rt6_flush_exceptions(struct fib6_info *rt)
1497 struct rt6_exception_bucket *bucket;
1498 struct rt6_exception *rt6_ex;
1499 struct hlist_node *tmp;
1502 spin_lock_bh(&rt6_exception_lock);
1503 /* Prevent rt6_insert_exception() to recreate the bucket list */
1504 rt->exception_bucket_flushed = 1;
1506 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1507 lockdep_is_held(&rt6_exception_lock));
/* walk all hash buckets and drop every chained exception */
1511 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1512 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1513 rt6_remove_exception(bucket, rt6_ex);
/* every bucket must be empty now */
1514 WARN_ON_ONCE(bucket->depth);
1519 spin_unlock_bh(&rt6_exception_lock);
1522 /* Find cached rt in the hash table inside passed in rt
1523 * Caller has to hold rcu_read_lock()
/* Returns the unexpired cached clone for (daddr[, saddr]) or NULL. */
1525 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1526 struct in6_addr *daddr,
1527 struct in6_addr *saddr)
1529 struct rt6_exception_bucket *bucket;
1530 struct in6_addr *src_key = NULL;
1531 struct rt6_exception *rt6_ex;
1532 struct rt6_info *res = NULL;
1534 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1536 #ifdef CONFIG_IPV6_SUBTREES
1537 /* rt6i_src.plen != 0 indicates rt is in subtree
1538 * and exception table is indexed by a hash of
1539 * both rt6i_dst and rt6i_src.
1540 * Otherwise, the exception table is indexed by
1541 * a hash of only rt6i_dst.
1543 if (rt->fib6_src.plen)
1546 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
/* expired clones are ignored; GC will reap them */
1548 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1554 /* Remove the passed in cached rt from the hash table that contains it */
/* Only RTF_CACHE clones with a live parent ('from') can be removed;
 * the lookup/removal itself runs under rt6_exception_lock.
 */
1555 static int rt6_remove_exception_rt(struct rt6_info *rt)
1557 struct rt6_exception_bucket *bucket;
1558 struct in6_addr *src_key = NULL;
1559 struct rt6_exception *rt6_ex;
1560 struct fib6_info *from;
1563 from = rcu_dereference(rt->from);
1565 !(rt->rt6i_flags & RTF_CACHE))
/* nothing to remove if the parent never had exceptions */
1568 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1571 spin_lock_bh(&rt6_exception_lock);
1572 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1573 lockdep_is_held(&rt6_exception_lock));
1574 #ifdef CONFIG_IPV6_SUBTREES
1575 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1576 * and exception table is indexed by a hash of
1577 * both rt6i_dst and rt6i_src.
1578 * Otherwise, the exception table is indexed by
1579 * a hash of only rt6i_dst.
1581 if (from->fib6_src.plen)
1582 src_key = &rt->rt6i_src.addr;
1584 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1588 rt6_remove_exception(bucket, rt6_ex);
1594 spin_unlock_bh(&rt6_exception_lock);
1598 /* Find rt6_ex which contains the passed in rt cache and
/* refresh its timestamp so exception GC aging treats it as
 * recently used (RCU-side lookup, no lock taken).
 */
1601 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1603 struct rt6_exception_bucket *bucket;
1604 struct in6_addr *src_key = NULL;
1605 struct rt6_exception *rt6_ex;
1606 struct fib6_info *from;
1609 from = rcu_dereference(rt->from);
1610 if (!from || !(rt->rt6i_flags & RTF_CACHE))
1613 bucket = rcu_dereference(from->rt6i_exception_bucket);
1615 #ifdef CONFIG_IPV6_SUBTREES
1616 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1617 * and exception table is indexed by a hash of
1618 * both rt6i_dst and rt6i_src.
1619 * Otherwise, the exception table is indexed by
1620 * a hash of only rt6i_dst.
1622 if (from->fib6_src.plen)
1623 src_key = &rt->rt6i_src.addr;
1625 rt6_ex = __rt6_find_exception_rcu(&bucket,
/* mark as freshly used for rt6_age_examine_exception() */
1629 rt6_ex->stamp = jiffies;
/* Decide whether a device MTU change may be propagated onto @rt's
 * cached PMTU (used by rt6_exceptions_update_pmtu() below).
 */
1635 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1636 struct rt6_info *rt, int mtu)
1638 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1639 * lowest MTU in the path: always allow updating the route PMTU to
1640 * reflect PMTU decreases.
1642 * If the new MTU is higher, and the route PMTU is equal to the local
1643 * MTU, this means the old MTU is the lowest in the path, so allow
1644 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1648 if (dst_mtu(&rt->dst) >= mtu)
1651 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
/* Propagate an MTU change to all eligible cached exception routes of
 * @rt.  Caller must hold rt6_exception_lock (see
 * rcu_dereference_protected() with lockdep_is_held below).
 */
1657 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1658 struct fib6_info *rt, int mtu)
1660 struct rt6_exception_bucket *bucket;
1661 struct rt6_exception *rt6_ex;
1664 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1665 lockdep_is_held(&rt6_exception_lock));
1670 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1671 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1672 struct rt6_info *entry = rt6_ex->rt6i;
1674 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1675 * route), the metrics of its rt->from have already
/* only update clones that carry their own MTU metric and
 * pass the rt6_mtu_change_route_allowed() policy
 */
1678 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1679 rt6_mtu_change_route_allowed(idev, entry, mtu))
1680 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1686 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
/* Drop all cached exception routes of @rt whose gateway equals
 * @gateway and that are both RTF_GATEWAY and RTF_CACHE (the
 * RTF_CACHE_GATEWAY mask) — used when a router stops being usable.
 */
1688 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1689 struct in6_addr *gateway)
1691 struct rt6_exception_bucket *bucket;
1692 struct rt6_exception *rt6_ex;
1693 struct hlist_node *tmp;
/* fast path: no exception bucket, nothing to clean */
1696 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1699 spin_lock_bh(&rt6_exception_lock);
1700 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1701 lockdep_is_held(&rt6_exception_lock));
1704 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1705 hlist_for_each_entry_safe(rt6_ex, tmp,
1706 &bucket->chain, hlist) {
1707 struct rt6_info *entry = rt6_ex->rt6i;
1709 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1710 RTF_CACHE_GATEWAY &&
1711 ipv6_addr_equal(gateway,
1712 &entry->rt6i_gateway)) {
1713 rt6_remove_exception(bucket, rt6_ex);
1720 spin_unlock_bh(&rt6_exception_lock);
/* GC helper: examine one cached exception and remove it if it has
 * aged out, expired (RFC 8201), or points through a gateway that is
 * no longer flagged as a router in the neighbour cache.
 */
1723 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1724 struct rt6_exception *rt6_ex,
1725 struct fib6_gc_args *gc_args,
1728 struct rt6_info *rt = rt6_ex->rt6i;
1730 /* we are pruning and obsoleting aged-out and non gateway exceptions
1731 * even if others have still references to them, so that on next
1732 * dst_check() such references can be dropped.
1733 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1734 * expired, independently from their aging, as per RFC 8201 section 4
1736 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
/* unused for longer than the GC timeout: age out */
1737 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1738 RT6_TRACE("aging clone %p\n", rt);
1739 rt6_remove_exception(bucket, rt6_ex);
1742 } else if (time_after(jiffies, rt->dst.expires)) {
1743 RT6_TRACE("purging expired route %p\n", rt);
1744 rt6_remove_exception(bucket, rt6_ex);
1748 if (rt->rt6i_flags & RTF_GATEWAY) {
1749 struct neighbour *neigh;
1750 __u8 neigh_flags = 0;
1752 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1754 neigh_flags = neigh->flags;
/* gateway neighbour lost its router flag: purge the clone */
1756 if (!(neigh_flags & NTF_ROUTER)) {
1757 RT6_TRACE("purging route %p via non-router but gateway\n",
1759 rt6_remove_exception(bucket, rt6_ex);
/* Run exception GC over all cached clones of @rt: walk every bucket
 * under rt6_exception_lock and let rt6_age_examine_exception() decide
 * each entry's fate.  The matching rcu_read_unlock_bh() below implies
 * this runs inside a bh-RCU read section.
 */
1767 void rt6_age_exceptions(struct fib6_info *rt,
1768 struct fib6_gc_args *gc_args,
1771 struct rt6_exception_bucket *bucket;
1772 struct rt6_exception *rt6_ex;
1773 struct hlist_node *tmp;
/* fast path: nothing cached on this entry */
1776 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1780 spin_lock(&rt6_exception_lock);
1781 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1782 lockdep_is_held(&rt6_exception_lock));
1785 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1786 hlist_for_each_entry_safe(rt6_ex, tmp,
1787 &bucket->chain, hlist) {
1788 rt6_age_examine_exception(bucket, rt6_ex,
1794 spin_unlock(&rt6_exception_lock);
1795 rcu_read_unlock_bh();
1798 /* must be called with rcu lock held */
/* Core table lookup: find the fib6 node for daddr/saddr, pick a route
 * with rt6_select(), and on a miss backtrack up the tree; as a last
 * resort retry without the REACHABLE restriction so unreachable
 * routers are also considered.
 */
1799 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1800 int oif, struct flowi6 *fl6, int strict)
1802 struct fib6_node *fn, *saved_fn;
1803 struct fib6_info *f6i;
1805 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1808 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1812 f6i = rt6_select(net, fn, oif, strict);
1813 if (f6i == net->ipv6.fib6_null_entry) {
/* nothing matched here: climb towards the root and retry */
1814 fn = fib6_backtrack(fn, &fl6->saddr);
1816 goto redo_rt6_select;
1817 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1818 /* also consider unreachable route */
1819 strict &= ~RT6_LOOKUP_F_REACHABLE;
1821 goto redo_rt6_select;
1825 trace_fib6_table_lookup(net, f6i, table, fl6);
/* Policy routing entry point: resolve fl6 to an rt6_info dst.
 * After the fib6 lookup (and multipath selection) the result is one of:
 *  - the null entry,
 *  - a cached exception route (preferred),
 *  - a freshly allocated uncached RTF_CACHE clone for the
 *    FLOWI_FLAG_KNOWN_NH special case, or
 *  - a per-cpu copy of the fib6 entry.
 * Runs under RCU (note dst_use_noref on the cached path).
 */
1830 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1831 int oif, struct flowi6 *fl6,
1832 const struct sk_buff *skb, int flags)
1834 struct fib6_info *f6i;
1835 struct rt6_info *rt;
1838 strict |= flags & RT6_LOOKUP_F_IFACE;
1839 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* when not forwarding, prefer (probably) reachable routers */
1840 if (net->ipv6.devconf_all->forwarding == 0)
1841 strict |= RT6_LOOKUP_F_REACHABLE;
1845 f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1846 if (f6i->fib6_nsiblings)
1847 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1849 if (f6i == net->ipv6.fib6_null_entry) {
1850 rt = net->ipv6.ip6_null_entry;
1856 /*Search through exception table */
1857 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1859 if (ip6_hold_safe(net, &rt))
1860 dst_use_noref(&rt->dst, jiffies);
1864 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1865 !f6i->fib6_nh.fib_nh_gw_family)) {
1866 /* Create a RTF_CACHE clone which will not be
1867 * owned by the fib6 tree. It is for the special case where
1868 * the daddr in the skb during the neighbor look-up is different
1869 * from the fl6->daddr used to look-up route here.
1871 struct rt6_info *uncached_rt;
1873 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1878 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1879 * No need for another dst_hold()
1881 rt6_uncached_list_add(uncached_rt);
1882 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
/* allocation failed: fall back to the null entry */
1884 uncached_rt = net->ipv6.ip6_null_entry;
1885 dst_hold(&uncached_rt->dst);
1890 /* Get a percpu copy */
1892 struct rt6_info *pcpu_rt;
1895 pcpu_rt = rt6_get_pcpu_route(f6i);
1898 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1906 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* fib6_rule_lookup() callback for the input path: use the incoming
 * interface (flowi6_iif) as the oif argument of ip6_pol_route().
 */
1908 static struct rt6_info *ip6_pol_route_input(struct net *net,
1909 struct fib6_table *table,
1911 const struct sk_buff *skb,
1914 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
/* Input-path route lookup through the policy rules engine.  Strict
 * interface matching is forced for scoped destinations, except on
 * PIM register devices.
 */
1917 struct dst_entry *ip6_route_input_lookup(struct net *net,
1918 struct net_device *dev,
1920 const struct sk_buff *skb,
1923 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1924 flags |= RT6_LOOKUP_F_IFACE;
1926 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1928 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Fill L3 flow keys for multipath hashing.  For ICMPv6 error messages
 * (dest-unreach, pkt-too-big, time-exceeded, param-problem) the keys
 * are taken from the embedded inner header so the error follows the
 * same path as the original flow; otherwise pre-dissected flkeys or
 * the outer header are used.
 */
1930 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1931 struct flow_keys *keys,
1932 struct flow_keys *flkeys)
1934 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1935 const struct ipv6hdr *key_iph = outer_iph;
1936 struct flow_keys *_flkeys = flkeys;
1937 const struct ipv6hdr *inner_iph;
1938 const struct icmp6hdr *icmph;
1939 struct ipv6hdr _inner_iph;
1940 struct icmp6hdr _icmph;
/* common case: not ICMPv6, use the outer header */
1942 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1945 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1946 sizeof(_icmph), &_icmph);
/* only ICMPv6 *error* types embed the offending packet */
1950 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1951 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1952 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1953 icmph->icmp6_type != ICMPV6_PARAMPROB)
1956 inner_iph = skb_header_pointer(skb,
1957 skb_transport_offset(skb) + sizeof(*icmph),
1958 sizeof(_inner_iph), &_inner_iph);
1962 key_iph = inner_iph;
1966 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1967 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1968 keys->tags.flow_label = _flkeys->tags.flow_label;
1969 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1971 keys->addrs.v6addrs.src = key_iph->saddr;
1972 keys->addrs.v6addrs.dst = key_iph->daddr;
1973 keys->tags.flow_label = ip6_flowlabel(key_iph);
1974 keys->basic.ip_proto = key_iph->nexthdr;
1978 /* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash for a flow according to the netns hash
 * policy: L3-only keys, or L4 (5-tuple) keys from either the skb's
 * dissected flow or the flowi6 fields.
 */
1979 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1980 const struct sk_buff *skb, struct flow_keys *flkeys)
1982 struct flow_keys hash_keys;
1985 switch (ip6_multipath_hash_policy(net)) {
/* L3 policy: addresses + flow label + protocol */
1987 memset(&hash_keys, 0, sizeof(hash_keys));
1988 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1990 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1992 hash_keys.addrs.v6addrs.src = fl6->saddr;
1993 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1994 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1995 hash_keys.basic.ip_proto = fl6->flowi6_proto;
/* L4 policy: include transport ports */
2000 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2001 struct flow_keys keys;
2003 /* short-circuit if we already have L4 hash present */
2005 return skb_get_hash_raw(skb) >> 1;
2007 memset(&hash_keys, 0, sizeof(hash_keys));
2010 skb_flow_dissect_flow_keys(skb, &keys, flag);
2013 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2014 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2015 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2016 hash_keys.ports.src = flkeys->ports.src;
2017 hash_keys.ports.dst = flkeys->ports.dst;
2018 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
/* no skb: build the 5-tuple directly from fl6 */
2020 memset(&hash_keys, 0, sizeof(hash_keys));
2021 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2022 hash_keys.addrs.v6addrs.src = fl6->saddr;
2023 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2024 hash_keys.ports.src = fl6->fl6_sport;
2025 hash_keys.ports.dst = fl6->fl6_dport;
2026 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2030 mhash = flow_hash_from_keys(&hash_keys);
/* Route an incoming skb: build a flowi6 from the IPv6 header (plus
 * tunnel key and optional early flow dissection), compute the
 * multipath hash for ICMPv6, and attach the looked-up dst.
 */
2035 void ip6_route_input(struct sk_buff *skb)
2037 const struct ipv6hdr *iph = ipv6_hdr(skb);
2038 struct net *net = dev_net(skb->dev);
2039 int flags = RT6_LOOKUP_F_HAS_SADDR;
2040 struct ip_tunnel_info *tun_info;
2041 struct flowi6 fl6 = {
2042 .flowi6_iif = skb->dev->ifindex,
2043 .daddr = iph->daddr,
2044 .saddr = iph->saddr,
2045 .flowlabel = ip6_flowinfo(iph),
2046 .flowi6_mark = skb->mark,
2047 .flowi6_proto = iph->nexthdr,
2049 struct flow_keys *flkeys = NULL, _flkeys;
/* carry the RX tunnel id into the lookup key */
2051 tun_info = skb_tunnel_info(skb);
2052 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2053 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2055 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
/* ICMPv6 errors must hash like the flow they refer to */
2058 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2059 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2062 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
/* fib6_rule_lookup() callback for the output path: use the outgoing
 * interface (flowi6_oif) as the oif argument of ip6_pol_route().
 */
2065 static struct rt6_info *ip6_pol_route_output(struct net *net,
2066 struct fib6_table *table,
2068 const struct sk_buff *skb,
2071 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
/* Output-path route lookup.  Multicast/link-local destinations are
 * short-circuited through the l3mdev link-scope lookup; otherwise
 * strict-interface and source-address flags are derived from the
 * socket and flow before the policy-rule lookup.
 */
2074 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2075 struct flowi6 *fl6, int flags)
2079 if (ipv6_addr_type(&fl6->daddr) &
2080 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2081 struct dst_entry *dst;
2083 dst = l3mdev_link_scope_lookup(net, fl6);
2088 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2090 any_src = ipv6_addr_any(&fl6->saddr);
/* bound socket, scoped daddr, or oif+unspecified source all
 * require strict interface matching
 */
2091 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2092 (fl6->flowi6_oif && any_src))
2093 flags |= RT6_LOOKUP_F_IFACE;
2096 flags |= RT6_LOOKUP_F_HAS_SADDR;
2098 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2100 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2102 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Clone @dst_orig into a blackhole dst on the loopback device: metrics
 * and addressing are copied but input/output discard all packets (used
 * e.g. by xfrm while a policy is being resolved).  Releases the
 * original dst; returns -ENOMEM on allocation failure.
 */
2104 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2106 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2107 struct net_device *loopback_dev = net->loopback_dev;
2108 struct dst_entry *new = NULL;
2110 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2111 DST_OBSOLETE_DEAD, 0);
2114 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
/* all traffic through this dst is silently dropped */
2118 new->input = dst_discard;
2119 new->output = dst_discard_out;
2121 dst_copy_metrics(new, &ort->dst);
2123 rt->rt6i_idev = in6_dev_get(loopback_dev);
2124 rt->rt6i_gateway = ort->rt6i_gateway;
2125 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2127 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2128 #ifdef CONFIG_IPV6_SUBTREES
2129 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2133 dst_release(dst_orig);
2134 return new ? new : ERR_PTR(-ENOMEM);
2138 * Destination cache support functions
/* A fib6 entry is still valid if its node cookie matches the caller's
 * cached cookie and it has not expired.
 */
2141 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2145 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2148 if (fib6_check_expired(f6i))
/* Validate a plain rt6_info against its parent's cookie and its own
 * expiry (dst returned only while still valid).
 */
2154 static struct dst_entry *rt6_check(struct rt6_info *rt,
2155 struct fib6_info *from,
2160 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2161 rt_cookie != cookie)
2164 if (rt6_check_expired(rt))
/* Validate a pcpu/uncached clone: it must be unexpired, still marked
 * DST_OBSOLETE_FORCE_CHK, and its parent must pass fib6_check().
 */
2170 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2171 struct fib6_info *from,
2174 if (!__rt6_check_expired(rt) &&
2175 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2176 fib6_check(from, cookie))
/* dst_ops->check hook.  Dispatches to the pcpu/uncached validation for
 * RTF_PCPU or uncached-list clones, and to the plain rt6_check()
 * otherwise; runs under RCU (rt->from is rcu-dereferenced).
 */
2182 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2184 struct dst_entry *dst_ret;
2185 struct fib6_info *from;
2186 struct rt6_info *rt;
2188 rt = container_of(dst, struct rt6_info, dst);
2192 /* All IPV6 dsts are created with ->obsolete set to the value
2193 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2194 * into this function always.
2197 from = rcu_dereference(rt->from);
2199 if (from && (rt->rt6i_flags & RTF_PCPU ||
2200 unlikely(!list_empty(&rt->rt6i_uncached))))
2201 dst_ret = rt6_dst_from_check(rt, from, cookie);
2203 dst_ret = rt6_check(rt, from, cookie);
/* dst_ops->negative_advice hook: for an expired RTF_CACHE clone, drop
 * it from its parent's exception table.
 */
2210 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2212 struct rt6_info *rt = (struct rt6_info *) dst;
2215 if (rt->rt6i_flags & RTF_CACHE) {
2217 if (rt6_check_expired(rt)) {
2218 rt6_remove_exception_rt(rt);
/* dst_ops->link_failure hook: report address-unreachable to the sender
 * and invalidate the offending route — remove a cached clone from its
 * exception table, or (for default routes) poke the owning fib6 node.
 */
2230 static void ip6_link_failure(struct sk_buff *skb)
2232 struct rt6_info *rt;
2234 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2236 rt = (struct rt6_info *) skb_dst(skb);
2239 if (rt->rt6i_flags & RTF_CACHE) {
2240 rt6_remove_exception_rt(rt);
2242 struct fib6_info *from;
2243 struct fib6_node *fn;
2245 from = rcu_dereference(rt->from);
2247 fn = rcu_dereference(from->fib6_node);
2248 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Arm (or re-arm) the expiry timer on a cached clone.  If the clone
 * was not yet marked RTF_EXPIRES it first inherits its parent's
 * expiry, then dst_set_expires() applies @timeout.
 */
2256 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2258 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2259 struct fib6_info *from;
2262 from = rcu_dereference(rt0->from);
2264 rt0->dst.expires = from->expires;
2268 dst_set_expires(&rt0->dst, timeout);
2269 rt0->rt6i_flags |= RTF_EXPIRES;
/* Record a new PMTU on @rt: set the MTU metric, mark the route
 * modified, and schedule expiry after ip6_rt_mtu_expires.
 */
2272 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2274 struct net *net = dev_net(rt->dst.dev);
2276 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2277 rt->rt6i_flags |= RTF_MODIFIED;
2278 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* True when a PMTU update should be stored in a new cached exception:
 * @rt is not itself a cache entry, but is a pcpu clone or still has a
 * live parent to attach the exception to.
 */
2281 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2283 return !(rt->rt6i_flags & RTF_CACHE) &&
2284 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
/* Core PMTU update.  Addresses come from the packet header when given,
 * else from the socket.  Updates smaller than the current dst MTU are
 * ignored (after clamping to IPV6_MIN_MTU).  Either the existing dst
 * is updated in place, or a new RTF_CACHE clone carrying the PMTU is
 * inserted as an exception on the parent fib6 entry.
 */
2287 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2288 const struct ipv6hdr *iph, u32 mtu)
2290 const struct in6_addr *daddr, *saddr;
2291 struct rt6_info *rt6 = (struct rt6_info *)dst;
/* a locked MTU metric must never be overridden */
2293 if (dst_metric_locked(dst, RTAX_MTU))
2297 daddr = &iph->daddr;
2298 saddr = &iph->saddr;
2300 daddr = &sk->sk_v6_daddr;
2301 saddr = &inet6_sk(sk)->saddr;
2306 dst_confirm_neigh(dst, daddr);
2307 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2308 if (mtu >= dst_mtu(dst))
2311 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2312 rt6_do_update_pmtu(rt6, mtu);
2313 /* update rt6_ex->stamp for cache */
2314 if (rt6->rt6i_flags & RTF_CACHE)
2315 rt6_update_exception_stamp_rt(rt6);
2317 struct fib6_info *from;
2318 struct rt6_info *nrt6;
2321 from = rcu_dereference(rt6->from);
2322 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2324 rt6_do_update_pmtu(nrt6, mtu);
/* insertion failure means the clone is unused: free it now */
2325 if (rt6_insert_exception(nrt6, from))
2326 dst_release_immediate(&nrt6->dst);
/* dst_ops->update_pmtu hook: thin wrapper passing the skb's IPv6
 * header (or NULL) to __ip6_rt_update_pmtu().
 */
2332 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2333 struct sk_buff *skb, u32 mtu)
2335 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Apply a PMTU update for the flow described by @skb's IPv6 header:
 * look up the matching output route and update its PMTU.
 * Note @mtu arrives in network byte order (ntohl below).
 */
2338 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2339 int oif, u32 mark, kuid_t uid)
2341 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2342 struct dst_entry *dst;
2343 struct flowi6 fl6 = {
2345 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2346 .daddr = iph->daddr,
2347 .saddr = iph->saddr,
2348 .flowlabel = ip6_flowinfo(iph),
2352 dst = ip6_route_output(net, NULL, &fl6);
2354 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2357 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-scoped PMTU update: resolve the oif (falling back to the
 * l3mdev master of the receiving device), update the route's PMTU,
 * then refresh the socket's cached dst if it no longer validates.
 */
2359 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2361 int oif = sk->sk_bound_dev_if;
2362 struct dst_entry *dst;
2364 if (!oif && skb->dev)
2365 oif = l3mdev_master_ifindex(skb->dev);
2367 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2369 dst = __sk_dst_get(sk);
2370 if (!dst || !dst->obsolete ||
2371 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
/* only safe without the socket lock held by user context */
2375 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2376 ip6_datagram_dst_update(sk, false);
2379 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* Store @dst on the socket via ip6_dst_store(), recording the
 * destination (and, with subtrees, source) address only when it
 * matches the socket's own, so later validation can use them.
 */
2381 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2382 const struct flowi6 *fl6)
2384 #ifdef CONFIG_IPV6_SUBTREES
2385 struct ipv6_pinfo *np = inet6_sk(sk);
2388 ip6_dst_store(sk, dst,
2389 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2390 &sk->sk_v6_daddr : NULL,
2391 #ifdef CONFIG_IPV6_SUBTREES
2392 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2398 /* Handle redirects */
/* flowi6 extended with the redirecting gateway's address, passed
 * through fib6_rule_lookup() into __ip6_route_redirect().
 */
2399 struct ip6rd_flowi {
2401 struct in6_addr gateway;
/* Resolve the route a redirect applies to.  Per RFC 4861 a redirect is
 * only accepted from the current next hop towards the target, so the
 * fib tree (and the exception table, whose cached gateways can differ
 * from their parent's) is searched for an entry whose gateway equals
 * the redirecting router.
 */
2404 static struct rt6_info *__ip6_route_redirect(struct net *net,
2405 struct fib6_table *table,
2407 const struct sk_buff *skb,
2410 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2411 struct rt6_info *ret = NULL, *rt_cache;
2412 struct fib6_info *rt;
2413 struct fib6_node *fn;
2415 /* Get the "current" route for this destination and
2416 * check if the redirect has come from appropriate router.
2418 * RFC 4861 specifies that redirects should only be
2419 * accepted if they come from the nexthop to the target.
2420 * Due to the way the routes are chosen, this notion
2421 * is a bit fuzzy and one might need to check all possible
2426 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2428 for_each_fib6_node_rt_rcu(fn) {
/* skip dead, expired, rejecting or gateway-less candidates */
2429 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
2431 if (fib6_check_expired(rt))
2433 if (rt->fib6_flags & RTF_REJECT)
2435 if (!rt->fib6_nh.fib_nh_gw_family)
2437 if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex)
2439 /* rt_cache's gateway might be different from its 'parent'
2440 * in the case of an ip redirect.
2441 * So we keep searching in the exception table if the gateway
2444 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) {
2445 rt_cache = rt6_find_cached_rt(rt,
2449 ipv6_addr_equal(&rdfl->gateway,
2450 &rt_cache->rt6i_gateway)) {
2460 rt = net->ipv6.fib6_null_entry;
2461 else if (rt->fib6_flags & RTF_REJECT) {
2462 ret = net->ipv6.ip6_null_entry;
/* miss at this node: backtrack up the tree and retry */
2466 if (rt == net->ipv6.fib6_null_entry) {
2467 fn = fib6_backtrack(fn, &fl6->saddr);
2474 ip6_hold_safe(net, &ret);
2476 ret = ip6_create_rt_rcu(rt);
2480 trace_fib6_table_lookup(net, rt, table, fl6);
/* Wrap @fl6 and the redirecting @gateway into an ip6rd_flowi and run
 * the policy-rule lookup with __ip6_route_redirect() as resolver.
 */
2484 static struct dst_entry *ip6_route_redirect(struct net *net,
2485 const struct flowi6 *fl6,
2486 const struct sk_buff *skb,
2487 const struct in6_addr *gateway)
2489 int flags = RT6_LOOKUP_F_HAS_SADDR;
2490 struct ip6rd_flowi rdfl;
2493 rdfl.gateway = *gateway;
2495 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2496 flags, __ip6_route_redirect);
/* Process a redirect for the flow described by @skb's IPv6 header:
 * resolve the affected route and apply the redirect via
 * rt6_do_redirect().  The redirecting router is the skb's source.
 */
2499 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2502 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2503 struct dst_entry *dst;
2504 struct flowi6 fl6 = {
2505 .flowi6_iif = LOOPBACK_IFINDEX,
2507 .flowi6_mark = mark,
2508 .daddr = iph->daddr,
2509 .saddr = iph->saddr,
2510 .flowlabel = ip6_flowinfo(iph),
2514 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2515 rt6_do_redirect(dst, NULL, skb);
2518 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Redirect variant used when no embedded packet header is available:
 * the flow is built from the ND redirect message (rd_msg) itself.
 */
2520 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2522 const struct ipv6hdr *iph = ipv6_hdr(skb);
2523 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2524 struct dst_entry *dst;
2525 struct flowi6 fl6 = {
2526 .flowi6_iif = LOOPBACK_IFINDEX,
2529 .saddr = iph->daddr,
2530 .flowi6_uid = sock_net_uid(net, NULL),
2533 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2534 rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper: redirect using the socket's bound
 * device and mark.
 */
2538 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2540 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2543 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss hook: derive the advertised MSS from the
 * dst MTU minus IPv6+TCP headers, floored by ip6_rt_min_advmss and
 * capped per the IPV6_MAXPLEN convention explained below.
 */
2545 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2547 struct net_device *dev = dst->dev;
2548 unsigned int mtu = dst_mtu(dst);
2549 struct net *net = dev_net(dev);
2551 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2553 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2554 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2557 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2558 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2559 * IPV6_MAXPLEN is also valid and means: "any MSS,
2560 * rely only on pmtu discovery"
2562 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu hook: use the raw MTU metric if set, else the device's
 * IPv6 MTU; clamp to IP6_MAX_MTU and subtract lwtunnel headroom.
 */
2567 static unsigned int ip6_mtu(const struct dst_entry *dst)
2569 struct inet6_dev *idev;
2572 mtu = dst_metric_raw(dst, RTAX_MTU);
2579 idev = __in6_dev_get(dst->dev);
2581 mtu = idev->cnf.mtu6;
2585 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2587 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2591 * 1. mtu on route is locked - use it
2592 * 2. mtu from nexthop exception
2593 * 3. mtu from egress device
2595 * based on ip6_dst_mtu_forward and exception logic of
2596 * rt6_find_cached_rt; called with rcu_read_lock
2598 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2599 struct in6_addr *saddr)
2601 struct rt6_exception_bucket *bucket;
2602 struct rt6_exception *rt6_ex;
2603 struct in6_addr *src_key;
2604 struct inet6_dev *idev;
/* (1) a locked MTU metric on the route wins outright */
2607 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2608 mtu = f6i->fib6_pmtu;
2614 #ifdef CONFIG_IPV6_SUBTREES
2615 if (f6i->fib6_src.plen)
/* (2) an unexpired cached exception supplies its own MTU */
2619 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2620 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2621 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2622 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
/* (3) fall back to the egress device's configured MTU */
2625 struct net_device *dev = fib6_info_nh_dev(f6i);
2628 idev = __in6_dev_get(dev);
2629 if (idev && idev->cnf.mtu6 > mtu)
2630 mtu = idev->cnf.mtu6;
2633 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2635 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
/* Allocate a standalone host-route dst for sending an ICMPv6 packet
 * (hop limit metric forced from the flow).  The dst is put on the
 * uncached list so device teardown can release it, then passed through
 * xfrm_lookup().  Returns ERR_PTR on failure.
 */
2638 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2641 struct dst_entry *dst;
2642 struct rt6_info *rt;
2643 struct inet6_dev *idev = in6_dev_get(dev);
2644 struct net *net = dev_net(dev);
2646 if (unlikely(!idev))
2647 return ERR_PTR(-ENODEV);
2649 rt = ip6_dst_alloc(net, dev, 0);
2650 if (unlikely(!rt)) {
2652 dst = ERR_PTR(-ENOMEM);
/* /128 host route towards the flow's destination */
2656 rt->dst.flags |= DST_HOST;
2657 rt->dst.input = ip6_input;
2658 rt->dst.output = ip6_output;
2659 rt->rt6i_gateway = fl6->daddr;
2660 rt->rt6i_dst.addr = fl6->daddr;
2661 rt->rt6i_dst.plen = 128;
2662 rt->rt6i_idev = idev;
2663 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2665 /* Add this dst into uncached_list so that rt6_disable_ip() can
2666 * do proper release of the net_device
2668 rt6_uncached_list_add(rt);
2669 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2671 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc hook: run fib6 garbage collection when the entry count
 * exceeds ip6_rt_max_size or the minimum GC interval has elapsed.  The
 * adaptive ip6_rt_gc_expire value shortens aging while under pressure
 * and decays by the elasticity factor otherwise.  Returns non-zero
 * when the table is still over the size limit after GC.
 */
2677 static int ip6_dst_gc(struct dst_ops *ops)
2679 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2680 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2681 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2682 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2683 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2684 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2687 entries = dst_entries_get_fast(ops);
/* too soon and not over the limit: skip this GC round */
2688 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2689 entries <= rt_max_size)
2692 net->ipv6.ip6_rt_gc_expire++;
2693 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2694 entries = dst_entries_get_slow(ops);
2695 if (entries < ops->gc_thresh)
2696 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2698 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2699 return entries > rt_max_size;
/* Look up the route towards a configured gateway in a specific table
 * (@tbid), used to validate a nexthop being added.  Link state is
 * ignored so a temporarily-down device does not reject the config.
 */
2702 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2703 struct fib6_config *cfg,
2704 const struct in6_addr *gw_addr,
2705 u32 tbid, int flags)
2707 struct flowi6 fl6 = {
2708 .flowi6_oif = cfg->fc_ifindex,
2710 .saddr = cfg->fc_prefsrc,
2712 struct fib6_table *table;
2713 struct rt6_info *rt;
2715 table = fib6_get_table(net, tbid);
2719 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2720 flags |= RT6_LOOKUP_F_HAS_SADDR;
2722 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2723 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2725 /* if table lookup failed, fall back to full lookup */
2726 if (rt == net->ipv6.ip6_null_entry) {
/* Validate an RTNH_F_ONLINK nexthop: the gateway must not resolve (in
 * the device's table) to a local/anycast/reject route or to a
 * different egress device — the default route is deliberately ignored
 * as a match.  Reports failures through @extack.
 */
2734 static int ip6_route_check_nh_onlink(struct net *net,
2735 struct fib6_config *cfg,
2736 const struct net_device *dev,
2737 struct netlink_ext_ack *extack)
2739 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2740 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2741 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2742 struct fib6_info *from;
2743 struct rt6_info *grt;
2747 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2750 from = rcu_dereference(grt->from);
2751 if (!grt->dst.error &&
2752 /* ignore match if it is the default route */
2753 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2754 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2755 NL_SET_ERR_MSG(extack,
2756 "Nexthop has invalid gateway or device mismatch");
/* Resolve the egress device/idev for a gateway nexthop when none was
 * given: look the gateway up (in cfg->fc_table if set, else via
 * rt6_lookup), adopt the route's device and inet6_dev, and reject
 * recursive gateways (grt itself having RTF_GATEWAY).
 */
2767 static int ip6_route_check_nh(struct net *net,
2768 struct fib6_config *cfg,
2769 struct net_device **_dev,
2770 struct inet6_dev **idev)
2772 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2773 struct net_device *dev = _dev ? *_dev : NULL;
2774 struct rt6_info *grt = NULL;
2775 int err = -EHOSTUNREACH;
2777 if (cfg->fc_table) {
2778 int flags = RT6_LOOKUP_F_IFACE;
2780 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2781 cfg->fc_table, flags);
/* nexthop must be directly reachable on a single device */
2783 if (grt->rt6i_flags & RTF_GATEWAY ||
2784 (dev && dev != grt->dst.dev)) {
2792 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2798 if (dev != grt->dst.dev) {
/* adopt the resolved device and take a ref on its idev */
2803 *_dev = dev = grt->dst.dev;
2804 *idev = grt->rt6i_idev;
2806 in6_dev_hold(grt->rt6i_idev);
2809 if (!(grt->rt6i_flags & RTF_GATEWAY))
/* Validate the gateway of a route being configured: it must not be a
 * local address, must be link-local (or IPv4-mapped/unicast for the
 * documented exceptions), and the resolved egress device must exist
 * and not be loopback.  The local-address check is repeated after
 * device resolution when no device was known up front.
 */
2818 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2819 struct net_device **_dev, struct inet6_dev **idev,
2820 struct netlink_ext_ack *extack)
2822 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2823 int gwa_type = ipv6_addr_type(gw_addr);
2824 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2825 const struct net_device *dev = *_dev;
2826 bool need_addr_check = !dev;
2829 /* if gw_addr is local we will fail to detect this in case
2830 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2831 * will return already-added prefix route via interface that
2832 * prefix route was assigned to, which might be non-loopback.
2835 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2836 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2840 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2841 /* IPv6 strictly inhibits using not link-local
2842 * addresses as nexthop address.
2843 * Otherwise, router will not able to send redirects.
2844 * It is very good, but in some (rare!) circumstances
2845 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2846 * some exceptions. --ANK
2847 * We allow IPv4-mapped nexthops to support RFC4798-type
2850 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2851 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2855 if (cfg->fc_flags & RTNH_F_ONLINK)
2856 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2858 err = ip6_route_check_nh(net, cfg, _dev, idev);
2864 /* reload in case device was changed */
2869 NL_SET_ERR_MSG(extack, "Egress device not specified");
2871 } else if (dev->flags & IFF_LOOPBACK) {
2872 NL_SET_ERR_MSG(extack,
2873 "Egress device can not be loopback device for this route");
2877 /* if we did not check gw_addr above, do so now that the
2878 * egress device has been resolved.
2880 if (need_addr_check &&
2881 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2882 NL_SET_ERR_MSG(extack, "Gateway can not be a local address")&#59;
/* Decide whether a route must be treated as a reject route: either it is
 * explicitly flagged RTF_REJECT, or it points a non-loopback destination
 * at a loopback device without being a local route.  (The return
 * statements fall outside this excerpt.)
 */
2891 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2893 if ((flags & RTF_REJECT) ||
2894 (dev && (dev->flags & IFF_LOOPBACK) &&
2895 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2896 !(flags & RTF_LOCAL)))
/* Initialize a fib6_nh from user configuration: resolve the egress device
 * (cfg->fc_ifindex or via gateway validation), promote loopback-bound
 * non-local routes to the loopback device, record gateway/flags/weight and
 * set up lwtunnel encap state via fib_nh_common_init().  Takes device and
 * idev references; the visible tail releases lwt state on the error path.
 * Errors are reported through @extack.  (Excerpt: several error/cleanup
 * lines are elided between the visible statements.)
 */
2902 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2903 struct fib6_config *cfg, gfp_t gfp_flags,
2904 struct netlink_ext_ack *extack)
2906 struct net_device *dev = NULL;
2907 struct inet6_dev *idev = NULL;
2911 fib6_nh->fib_nh_family = AF_INET6;
/* Explicit interface requested: take refs on the device and its idev. */
2914 if (cfg->fc_ifindex) {
2915 dev = dev_get_by_index(net, cfg->fc_ifindex);
2918 idev = in6_dev_get(dev);
/* Onlink nexthops require an explicit, administratively-up device. */
2923 if (cfg->fc_flags & RTNH_F_ONLINK) {
2925 NL_SET_ERR_MSG(extack,
2926 "Nexthop device required for onlink");
2930 if (!(dev->flags & IFF_UP)) {
2931 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2936 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
2939 fib6_nh->fib_nh_weight = 1;
2941 /* We cannot add true routes via loopback here,
2942 * they would result in kernel looping; promote them to reject routes
2944 addr_type = ipv6_addr_type(&cfg->fc_dst);
2945 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
2946 /* hold loopback dev/idev if we haven't done so. */
2947 if (dev != net->loopback_dev) {
2952 dev = net->loopback_dev;
2954 idev = in6_dev_get(dev);
/* Gateway routes: validate the gateway and adopt the resolved device. */
2963 if (cfg->fc_flags & RTF_GATEWAY) {
2964 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2968 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
2969 fib6_nh->fib_nh_gw_family = AF_INET6;
2976 if (idev->cnf.disable_ipv6) {
2977 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
/* fc_ignore_dev_down lets addrconf-style callers bypass the UP check. */
2982 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
2983 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
/* Carrier loss marks the nexthop LINKDOWN, except for local/anycast. */
2988 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2989 !netif_carrier_ok(dev))
2990 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
2992 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
2993 cfg->fc_encap_type, cfg, gfp_flags, extack);
2997 fib6_nh->fib_nh_dev = dev;
2998 fib6_nh->fib_nh_oif = dev->ifindex;
/* Error path: drop any lwtunnel state acquired above. */
3005 lwtstate_put(fib6_nh->fib_nh_lws);
3006 fib6_nh->fib_nh_lws = NULL;
/* Release the resources held by a fib6_nh (counterpart of fib6_nh_init());
 * delegates to the family-independent fib_nh_common_release().
 */
3014 void fib6_nh_release(struct fib6_nh *fib6_nh)
3016 fib_nh_common_release(&fib6_nh->nh_common);
/* Build (but do not insert) a fib6_info from a fib6_config: validate the
 * user-supplied flags/type/prefix lengths, pick or create the FIB table,
 * allocate the entry, initialize metrics/expiry/protocol/prefixes and the
 * nexthop, and apply the loopback->reject promotion.  Returns the new
 * fib6_info or ERR_PTR(err); on failure the entry is released before
 * returning.  (Excerpt: several error-branch lines are elided.)
 */
3019 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3021 struct netlink_ext_ack *extack)
3023 struct net *net = cfg->fc_nlinfo.nl_net;
3024 struct fib6_info *rt = NULL;
3025 struct fib6_table *table;
3029 /* RTF_PCPU is an internal flag; can not be set by userspace */
3030 if (cfg->fc_flags & RTF_PCPU) {
3031 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3035 /* RTF_CACHE is an internal flag; can not be set by userspace */
3036 if (cfg->fc_flags & RTF_CACHE) {
3037 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3041 if (cfg->fc_type > RTN_MAX) {
3042 NL_SET_ERR_MSG(extack, "Invalid route type");
3046 if (cfg->fc_dst_len > 128) {
3047 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3050 if (cfg->fc_src_len > 128) {
3051 NL_SET_ERR_MSG(extack, "Invalid source address length");
3054 #ifndef CONFIG_IPV6_SUBTREES
/* Source-routing prefixes need subtree support compiled in. */
3055 if (cfg->fc_src_len) {
3056 NL_SET_ERR_MSG(extack,
3057 "Specifying source address requires IPV6_SUBTREES to be enabled");
/* Without NLM_F_CREATE only look the table up; warn-and-create keeps
 * legacy userspace working.
 */
3063 if (cfg->fc_nlinfo.nlh &&
3064 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3065 table = fib6_get_table(net, cfg->fc_table);
3067 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3068 table = fib6_new_table(net, cfg->fc_table);
3071 table = fib6_new_table(net, cfg->fc_table);
3078 rt = fib6_info_alloc(gfp_flags);
3082 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3084 if (IS_ERR(rt->fib6_metrics)) {
3085 err = PTR_ERR(rt->fib6_metrics);
3086 /* Do not leave garbage there. */
3087 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
/* Addrconf-originated routes are not accounted against user limits. */
3091 if (cfg->fc_flags & RTF_ADDRCONF)
3092 rt->dst_nocount = true;
3094 if (cfg->fc_flags & RTF_EXPIRES)
3095 fib6_set_expires(rt, jiffies +
3096 clock_t_to_jiffies(cfg->fc_expires));
3098 fib6_clean_expires(rt);
3100 if (cfg->fc_protocol == RTPROT_UNSPEC)
3101 cfg->fc_protocol = RTPROT_BOOT;
3102 rt->fib6_protocol = cfg->fc_protocol;
3104 rt->fib6_table = table;
3105 rt->fib6_metric = cfg->fc_metric;
3106 rt->fib6_type = cfg->fc_type;
/* The gateway itself lives in the nexthop; keep RTF_GATEWAY out of
 * the route flags here.
 */
3107 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3109 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3110 rt->fib6_dst.plen = cfg->fc_dst_len;
3111 if (rt->fib6_dst.plen == 128)
3112 rt->dst_host = true;
3114 #ifdef CONFIG_IPV6_SUBTREES
3115 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3116 rt->fib6_src.plen = cfg->fc_src_len;
3118 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3122 /* We cannot add true routes via loopback here,
3123 * they would result in kernel looping; promote them to reject routes
3125 addr_type = ipv6_addr_type(&cfg->fc_dst);
3126 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3127 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
/* A preferred source address must actually exist on the nexthop device. */
3129 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3130 struct net_device *dev = fib6_info_nh_dev(rt);
3132 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3133 NL_SET_ERR_MSG(extack, "Invalid source address");
3137 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3138 rt->fib6_prefsrc.plen = 128;
3140 rt->fib6_prefsrc.plen = 0;
/* Error path: drop the half-built entry. */
3144 fib6_info_release(rt);
3145 return ERR_PTR(err);
/* Create a fib6_info from @cfg and insert it into the FIB.  The insert
 * takes its own reference, so the local reference is dropped afterwards.
 * Returns 0 or a negative errno (propagated via the elided lines).
 */
3148 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3149 struct netlink_ext_ack *extack)
3151 struct fib6_info *rt;
3154 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3158 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3159 fib6_info_release(rt);
/* Delete a single fib6_info from its table under tb6_lock and drop the
 * caller's reference.  The null entry is never deletable (branch body
 * elided in this excerpt).
 */
3164 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3166 struct net *net = info->nl_net;
3167 struct fib6_table *table;
3170 if (rt == net->ipv6.fib6_null_entry) {
3175 table = rt->fib6_table;
3176 spin_lock_bh(&table->tb6_lock);
3177 err = fib6_del(rt, info);
3178 spin_unlock_bh(&table->tb6_lock);
/* Consume the reference the caller passed in. */
3181 fib6_info_release(rt);
/* Public wrapper around __ip6_del_rt() with a minimal nl_info (netns only,
 * no originating netlink message).
 */
3185 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3187 struct nl_info info = { .nl_net = net };
3189 return __ip6_del_rt(rt, &info);
/* Delete a route and, when fc_delete_all_nh is set, all of its multipath
 * siblings in one pass under tb6_lock.  A single RTM_DELROUTE notification
 * covering every hop is pre-built and sent after the lock is dropped;
 * per-route notifications are suppressed via info->skip_notify.
 * (Excerpt: some error/iteration lines are elided.)
 */
3192 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3194 struct nl_info *info = &cfg->fc_nlinfo;
3195 struct net *net = info->nl_net;
3196 struct sk_buff *skb = NULL;
3197 struct fib6_table *table;
3200 if (rt == net->ipv6.fib6_null_entry)
3202 table = rt->fib6_table;
3203 spin_lock_bh(&table->tb6_lock);
3205 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3206 struct fib6_info *sibling, *next_sibling;
3208 /* prefer to send a single notification with all hops */
3209 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3211 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3213 if (rt6_fill_node(net, skb, rt, NULL,
3214 NULL, NULL, 0, RTM_DELROUTE,
3215 info->portid, seq, 0) < 0) {
3219 info->skip_notify = 1;
3222 list_for_each_entry_safe(sibling, next_sibling,
3225 err = fib6_del(sibling, info);
3231 err = fib6_del(rt, info);
3233 spin_unlock_bh(&table->tb6_lock);
3235 fib6_info_release(rt);
/* Send the combined notification outside the table lock. */
3238 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3239 info->nlh, gfp_any());
/* Remove a cached (exception) route if it matches the optional device and
 * gateway filters from @cfg; otherwise leave it in place.  Delegates the
 * actual removal to rt6_remove_exception_rt().
 */
3244 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3248 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3251 if (cfg->fc_flags & RTF_GATEWAY &&
3252 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3255 rc = rt6_remove_exception_rt(rt);
/* Delete the route described by @cfg: locate the matching fib6_node, walk
 * its routes under RCU and filter on device, gateway, metric and protocol.
 * With RTF_CACHE set, the matching cached exception route is removed
 * instead of the FIB entry.  With RTF_GATEWAY only the single matching hop
 * is deleted; otherwise siblings may be deleted too.  (Excerpt: several
 * continue/unlock lines are elided.)
 */
3260 static int ip6_route_del(struct fib6_config *cfg,
3261 struct netlink_ext_ack *extack)
3263 struct rt6_info *rt_cache;
3264 struct fib6_table *table;
3265 struct fib6_info *rt;
3266 struct fib6_node *fn;
3269 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3271 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3277 fn = fib6_locate(&table->tb6_root,
3278 &cfg->fc_dst, cfg->fc_dst_len,
3279 &cfg->fc_src, cfg->fc_src_len,
3280 !(cfg->fc_flags & RTF_CACHE));
3283 for_each_fib6_node_rt_rcu(fn) {
3286 if (cfg->fc_flags & RTF_CACHE) {
3289 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3292 rc = ip6_del_cached_rt(rt_cache, cfg);
3302 if (cfg->fc_ifindex &&
3304 nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3306 if (cfg->fc_flags & RTF_GATEWAY &&
3307 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3309 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3311 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
/* Take a reference before leaving the RCU walk to delete the entry. */
3313 if (!fib6_info_hold_safe(rt))
3317 /* if gateway was specified only delete the one hop */
3318 if (cfg->fc_flags & RTF_GATEWAY)
3319 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3321 return __ip6_del_rt_siblings(rt, cfg);
/* Process a received ICMPv6 Redirect (RFC 4861 sec. 8): validate the
 * message and its ND options, confirm the current first-hop neighbour,
 * update the neighbour cache for the redirect target, and install a
 * RTF_CACHE exception route toward msg->dest via the new gateway.
 * A netevent is raised so interested subsystems see the path change.
 * (Excerpt: some validation/exit lines are elided.)
 */
3329 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3331 struct netevent_redirect netevent;
3332 struct rt6_info *rt, *nrt = NULL;
3333 struct ndisc_options ndopts;
3334 struct inet6_dev *in6_dev;
3335 struct neighbour *neigh;
3336 struct fib6_info *from;
3338 int optlen, on_link;
3341 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3342 optlen -= sizeof(*msg);
3345 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3349 msg = (struct rd_msg *)icmp6_hdr(skb);
3351 if (ipv6_addr_is_multicast(&msg->dest)) {
3352 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination is on-link (redirect-to-self). */
3357 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3359 } else if (ipv6_addr_type(&msg->target) !=
3360 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3361 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3365 in6_dev = __in6_dev_get(skb->dev);
/* Routers, and hosts with accept_redirects off, ignore redirects. */
3368 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3372 * The IP source address of the Redirect MUST be the same as the current
3373 * first-hop router for the specified ICMP Destination Address.
3376 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3377 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3382 if (ndopts.nd_opts_tgt_lladdr) {
3383 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3386 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3391 rt = (struct rt6_info *) dst;
3392 if (rt->rt6i_flags & RTF_REJECT) {
3393 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3397 /* Redirect received -> path was valid.
3398 * Look, redirects are sent only in response to data packets,
3399 * so that this nexthop apparently is reachable. --ANK
3401 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3403 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3408 * We have finally decided to accept it.
3411 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3412 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3413 NEIGH_UPDATE_F_OVERRIDE|
3414 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3415 NEIGH_UPDATE_F_ISROUTER)),
3416 NDISC_REDIRECT, &ndopts);
3419 from = rcu_dereference(rt->from);
3420 /* This fib6_info_hold() is safe here because we hold reference to rt
3421 * and rt already holds reference to fib6_info.
3423 fib6_info_hold(from);
3426 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3430 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
/* On-link redirects clear the gateway flag on the new cache entry. */
3432 nrt->rt6i_flags &= ~RTF_GATEWAY;
3434 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3436 /* No need to remove rt from the exception table if rt is
3437 * a cached route because rt6_insert_exception() will
3440 if (rt6_insert_exception(nrt, from)) {
3441 dst_release_immediate(&nrt->dst);
3445 netevent.old = &rt->dst;
3446 netevent.new = &nrt->dst;
3447 netevent.daddr = &msg->dest;
3448 netevent.neigh = neigh;
3449 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3452 fib6_info_release(from);
3453 neigh_release(neigh);
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RA Route Information (RFC 4191) route for
 * @prefix/@prefixlen via @gwaddr on @dev in the info table.  Returns the
 * entry with a reference held, or NULL.  Runs the walk under RCU.
 */
3457 static struct fib6_info *rt6_get_route_info(struct net *net,
3458 const struct in6_addr *prefix, int prefixlen,
3459 const struct in6_addr *gwaddr,
3460 struct net_device *dev)
3462 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3463 int ifindex = dev->ifindex;
3464 struct fib6_node *fn;
3465 struct fib6_info *rt = NULL;
3466 struct fib6_table *table;
3468 table = fib6_get_table(net, tb_id);
3473 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3477 for_each_fib6_node_rt_rcu(fn) {
/* Match on device, RTF_ROUTEINFO origin, and exact gateway. */
3478 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3480 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3481 !rt->fib6_nh.fib_nh_gw_family)
3483 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3485 if (!fib6_info_hold_safe(rt))
/* Install a route learned from an RA Route Information option (RFC 4191):
 * build a fib6_config flagged RTF_ROUTEINFO with the router preference,
 * add it, then return the (re-)looked-up entry via rt6_get_route_info().
 */
3494 static struct fib6_info *rt6_add_route_info(struct net *net,
3495 const struct in6_addr *prefix, int prefixlen,
3496 const struct in6_addr *gwaddr,
3497 struct net_device *dev,
3500 struct fib6_config cfg = {
3501 .fc_metric = IP6_RT_PRIO_USER,
3502 .fc_ifindex = dev->ifindex,
3503 .fc_dst_len = prefixlen,
3504 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3505 RTF_UP | RTF_PREF(pref),
3506 .fc_protocol = RTPROT_RA,
3507 .fc_type = RTN_UNICAST,
3508 .fc_nlinfo.portid = 0,
3509 .fc_nlinfo.nlh = NULL,
3510 .fc_nlinfo.nl_net = net,
3513 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3514 cfg.fc_dst = *prefix;
3515 cfg.fc_gateway = *gwaddr;
3517 /* We should treat it as a default route if prefix length is 0. */
3519 cfg.fc_flags |= RTF_DEFAULT;
/* Ignore the add result; success is determined by the lookup below. */
3521 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3523 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Find the RA-learned default route via router @addr on @dev in the
 * default-router table.  Returns the entry with a reference held (via
 * fib6_info_hold_safe), or NULL.  Walk runs under RCU.
 */
3527 struct fib6_info *rt6_get_dflt_router(struct net *net,
3528 const struct in6_addr *addr,
3529 struct net_device *dev)
3531 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3532 struct fib6_info *rt;
3533 struct fib6_table *table;
3535 table = fib6_get_table(net, tb_id);
3540 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3541 struct fib6_nh *nh = &rt->fib6_nh;
/* Must be this device's addrconf default route with matching gateway. */
3543 if (dev == nh->fib_nh_dev &&
3544 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3545 ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3548 if (rt && !fib6_info_hold_safe(rt))
/* Install an RA-learned default router: add an expiring RTF_DEFAULT route
 * via @gwaddr with preference @pref, mark the table as holding a default
 * router, then return the entry via rt6_get_dflt_router().
 */
3554 struct fib6_info *rt6_add_dflt_router(struct net *net,
3555 const struct in6_addr *gwaddr,
3556 struct net_device *dev,
3559 struct fib6_config cfg = {
3560 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3561 .fc_metric = IP6_RT_PRIO_USER,
3562 .fc_ifindex = dev->ifindex,
3563 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3564 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3565 .fc_protocol = RTPROT_RA,
3566 .fc_type = RTN_UNICAST,
3567 .fc_nlinfo.portid = 0,
3568 .fc_nlinfo.nlh = NULL,
3569 .fc_nlinfo.nl_net = net,
3572 cfg.fc_gateway = *gwaddr;
3574 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3575 struct fib6_table *table;
3577 table = fib6_get_table(dev_net(dev), cfg.fc_table);
/* Flag used by rt6_purge_dflt_routers() to skip clean tables. */
3579 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3582 return rt6_get_dflt_router(net, gwaddr, dev);
/* Delete every RA-learned default route in @table, except on devices whose
 * idev has accept_ra == 2 (always accept RAs, even when forwarding).
 * Clears the table's HAS_DFLT_ROUTER flag when done.  (Excerpt: the
 * restart-after-delete control flow is elided.)
 */
3585 static void __rt6_purge_dflt_routers(struct net *net,
3586 struct fib6_table *table)
3588 struct fib6_info *rt;
3592 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3593 struct net_device *dev = fib6_info_nh_dev(rt);
3594 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3596 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3597 (!idev || idev->cnf.accept_ra != 2) &&
3598 fib6_info_hold_safe(rt)) {
3600 ip6_del_rt(net, rt);
3606 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Purge RA default routers from every FIB table in the namespace that is
 * flagged RT6_TABLE_HAS_DFLT_ROUTER, walking the table hash under RCU.
 */
3609 void rt6_purge_dflt_routers(struct net *net)
3611 struct fib6_table *table;
3612 struct hlist_head *head;
3617 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3618 head = &net->ipv6.fib_table_hash[h];
3619 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3620 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3621 __rt6_purge_dflt_routers(net, table);
/* Translate a legacy ioctl in6_rtmsg into a fib6_config, defaulting the
 * metric to IP6_RT_PRIO_USER and honouring any l3mdev table binding of
 * the requested interface.
 */
3628 static void rtmsg_to_fib6_config(struct net *net,
3629 struct in6_rtmsg *rtmsg,
3630 struct fib6_config *cfg)
3632 *cfg = (struct fib6_config){
3633 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3635 .fc_ifindex = rtmsg->rtmsg_ifindex,
3636 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3637 .fc_expires = rtmsg->rtmsg_info,
3638 .fc_dst_len = rtmsg->rtmsg_dst_len,
3639 .fc_src_len = rtmsg->rtmsg_src_len,
3640 .fc_flags = rtmsg->rtmsg_flags,
3641 .fc_type = rtmsg->rtmsg_type,
3643 .fc_nlinfo.nl_net = net,
3645 .fc_dst = rtmsg->rtmsg_dst,
3646 .fc_src = rtmsg->rtmsg_src,
3647 .fc_gateway = rtmsg->rtmsg_gateway,
/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point: requires CAP_NET_ADMIN in
 * the namespace, copies the in6_rtmsg from userspace, converts it to a
 * fib6_config and dispatches to ip6_route_add()/ip6_route_del().
 * (Excerpt: locking and default-case lines are elided.)
 */
3651 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3653 struct fib6_config cfg;
3654 struct in6_rtmsg rtmsg;
3658 case SIOCADDRT: /* Add a route */
3659 case SIOCDELRT: /* Delete a route */
3660 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3662 err = copy_from_user(&rtmsg, arg,
3663 sizeof(struct in6_rtmsg));
3667 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3672 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3675 err = ip6_route_del(&cfg, NULL);
3689 * Drop the packet on the floor
/* Common drop handler for reject-style routes: bump the appropriate
 * per-direction SNMP counter (in-addr-errors for packets to the
 * unspecified address) and send an ICMPv6 Destination Unreachable with
 * @code back to the sender.
 */
3692 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3695 struct dst_entry *dst = skb_dst(skb);
3696 switch (ipstats_mib_noroutes) {
3697 case IPSTATS_MIB_INNOROUTES:
3698 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3699 if (type == IPV6_ADDR_ANY) {
3700 IP6_INC_STATS(dev_net(dst->dev),
3701 __in6_dev_get_safely(skb->dev),
3702 IPSTATS_MIB_INADDRERRORS);
3706 case IPSTATS_MIB_OUTNOROUTES:
3707 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3708 ipstats_mib_noroutes);
3711 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* Input handler for blackhole/unreachable routes: drop with "no route". */
3716 static int ip6_pkt_discard(struct sk_buff *skb)
3718 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* Output-path counterpart of ip6_pkt_discard(); sets skb->dev from the
 * dst before dropping so stats are attributed to the egress device.
 */
3721 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3723 skb->dev = skb_dst(skb)->dev;
3724 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* Input handler for prohibit routes: drop with "administratively prohibited". */
3727 static int ip6_pkt_prohibit(struct sk_buff *skb)
3729 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* Output-path counterpart of ip6_pkt_prohibit(); sets skb->dev from the
 * dst before dropping so stats are attributed to the egress device.
 */
3732 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3734 skb->dev = skb_dst(skb)->dev;
3735 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3739 * Allocate a dst for local (unicast / anycast) address.
/* Build (without inserting) a fib6_info for a local or anycast host route
 * on @idev's device: RTN_ANYCAST/RTF_ANYCAST when @anycast, otherwise
 * RTN_LOCAL/RTF_LOCAL.  fc_ignore_dev_down lets addrconf create the route
 * before the device is up.
 */
3742 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3743 struct inet6_dev *idev,
3744 const struct in6_addr *addr,
3745 bool anycast, gfp_t gfp_flags)
3747 struct fib6_config cfg = {
3748 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3749 .fc_ifindex = idev->dev->ifindex,
3750 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3753 .fc_protocol = RTPROT_KERNEL,
3754 .fc_nlinfo.nl_net = net,
3755 .fc_ignore_dev_down = true,
3759 cfg.fc_type = RTN_ANYCAST;
3760 cfg.fc_flags |= RTF_ANYCAST;
3762 cfg.fc_type = RTN_LOCAL;
3763 cfg.fc_flags |= RTF_LOCAL;
3766 return ip6_route_info_create(&cfg, gfp_flags, NULL);
3769 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc(): the device the address was
 * on (NULL matches any device) and the address being removed.  (The net
 * member line is elided in this excerpt.)
 */
3770 struct arg_dev_net_ip {
3771 struct net_device *dev;
3773 struct in6_addr *addr;
/* fib6_clean_all() callback: clear the preferred-source setting (plen = 0)
 * on every route whose prefsrc equals the removed address and whose device
 * matches (or when no device filter is given).  The null entry is skipped.
 */
3776 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3778 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3779 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3780 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3782 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3783 rt != net->ipv6.fib6_null_entry &&
3784 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3785 spin_lock_bh(&rt6_exception_lock);
3786 /* remove prefsrc entry */
3787 rt->fib6_prefsrc.plen = 0;
3788 spin_unlock_bh(&rt6_exception_lock);
/* When address @ifp is deleted, walk the whole FIB and clear it from any
 * route's preferred-source field via fib6_remove_prefsrc().
 */
3793 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3795 struct net *net = dev_net(ifp->idev->dev);
3796 struct arg_dev_net_ip adni = {
3797 .dev = ifp->idev->dev,
3801 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
/* Both flags together identify an RA-learned default-router route. */
#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
3806 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: match RA router routes whose gateway equals
 * the address that just became a local/host address, and purge matching
 * cached exception routes as well (the delete action itself is on elided
 * lines).
 */
3807 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3809 struct in6_addr *gateway = (struct in6_addr *)arg;
3811 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3812 rt->fib6_nh.fib_nh_gw_family &&
3813 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3817 /* Further clean up cached routes in exception table.
3818 * This is needed because cached route may have a different
3819 * gateway than its 'parent' in the case of an ip redirect.
3821 rt6_exceptions_clean_tohost(rt, gateway);
/* Public entry: sweep the FIB for routes via @gateway after it became a
 * host address.
 */
3826 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3828 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for the fib6_ifup/fib6_ifdown sweeps: the device, the
 * nexthop flags to set/clear, and the netdev event being handled (the
 * union wrapping nh_flags/event is on elided lines).
 */
3831 struct arg_netdev_event {
3832 const struct net_device *dev;
3834 unsigned int nh_flags;
3835 unsigned long event;
/* Walk @rt's fib6_node leaf chain (under tb6_lock, hence the
 * rcu_dereference_protected) and return the first route with the same
 * metric that qualifies for ECMP — i.e. the head of @rt's sibling group.
 */
3839 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3841 struct fib6_info *iter;
3842 struct fib6_node *fn;
3844 fn = rcu_dereference_protected(rt->fib6_node,
3845 lockdep_is_held(&rt->fib6_table->tb6_lock));
3846 iter = rcu_dereference_protected(fn->leaf,
3847 lockdep_is_held(&rt->fib6_table->tb6_lock));
3849 if (iter->fib6_metric == rt->fib6_metric &&
3850 rt6_qualify_for_ecmp(iter))
3852 iter = rcu_dereference_protected(iter->fib6_next,
3853 lockdep_is_held(&rt->fib6_table->tb6_lock));
/* A nexthop is "dead" for multipath weighting when flagged RTNH_F_DEAD,
 * or when flagged RTNH_F_LINKDOWN and the device ignores linkdown routes.
 */
3859 static bool rt6_is_dead(const struct fib6_info *rt)
3861 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3862 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3863 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
/* Sum the nexthop weights of @rt and all of its siblings, counting only
 * nexthops that are not dead (see rt6_is_dead()).
 */
3869 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3871 struct fib6_info *iter;
3874 if (!rt6_is_dead(rt))
3875 total += rt->fib6_nh.fib_nh_weight;
3877 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3878 if (!rt6_is_dead(iter))
3879 total += iter->fib6_nh.fib_nh_weight;
/* Assign @rt's hash-threshold upper bound from the cumulative weight:
 * live nexthops get round(cum_weight * 2^31 / total); dead ones get -1
 * so they are never selected.  *weight accumulates across siblings.
 */
3885 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3887 int upper_bound = -1;
3889 if (!rt6_is_dead(rt)) {
3890 *weight += rt->fib6_nh.fib_nh_weight;
3891 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3894 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
/* Recompute the hash-threshold upper bounds for @rt and every sibling,
 * threading one cumulative weight through rt6_upper_bound_set().
 */
3897 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3899 struct fib6_info *iter;
3902 rt6_upper_bound_set(rt, &weight, total);
3904 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3905 rt6_upper_bound_set(iter, &weight, total);
/* Rebalance a multipath route's hash-threshold bounds after a nexthop
 * changed state: start from the first sibling (lookup order matters),
 * total the live weights and redistribute the upper bounds.  No-ops for
 * non-multipath routes and for groups already marked for flushing.
 */
3908 void rt6_multipath_rebalance(struct fib6_info *rt)
3910 struct fib6_info *first;
3913 /* In case the entire multipath route was marked for flushing,
3914 * then there is no need to rebalance upon the removal of every
3917 if (!rt->fib6_nsiblings || rt->should_flush)
3920 /* During lookup routes are evaluated in order, so we need to
3921 * make sure upper bounds are assigned from the first sibling
3924 first = rt6_multipath_first_sibling(rt);
3925 if (WARN_ON_ONCE(!first))
3928 total = rt6_multipath_total_weight(first);
3929 rt6_multipath_upper_bound_set(first, total);
/* fib6_clean_all() callback for device-up events: clear the requested
 * nexthop flags on routes using the device, bump the FIB serial number so
 * cached dsts revalidate, and rebalance the multipath group.
 */
3932 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3934 const struct arg_netdev_event *arg = p_arg;
3935 struct net *net = dev_net(arg->dev);
3937 if (rt != net->ipv6.fib6_null_entry &&
3938 rt->fib6_nh.fib_nh_dev == arg->dev) {
3939 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3940 fib6_update_sernum_upto_root(net, rt);
3941 rt6_multipath_rebalance(rt);
/* Sync the FIB after @dev came up: clear @nh_flags on its routes.  When
 * clearing DEAD and the carrier is present, LINKDOWN is cleared too.
 */
3947 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3949 struct arg_netdev_event arg = {
3952 .nh_flags = nh_flags,
3956 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3957 arg.nh_flags |= RTNH_F_LINKDOWN;
3959 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
/* True when @rt or any of its multipath siblings egresses through @dev. */
3962 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3963 const struct net_device *dev)
3965 struct fib6_info *iter;
3967 if (rt->fib6_nh.fib_nh_dev == dev)
3969 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3970 if (iter->fib6_nh.fib_nh_dev == dev)
/* Mark @rt and all of its siblings for removal (should_flush), so the
 * whole ECMP group is torn down together.
 */
3976 static void rt6_multipath_flush(struct fib6_info *rt)
3978 struct fib6_info *iter;
3980 rt->should_flush = 1;
3981 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3982 iter->should_flush = 1;
/* Count how many nexthops in @rt's ECMP group are dead or about to die:
 * either already flagged RTNH_F_DEAD or egressing through @down_dev.
 */
3985 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3986 const struct net_device *down_dev)
3988 struct fib6_info *iter;
3989 unsigned int dead = 0;
3991 if (rt->fib6_nh.fib_nh_dev == down_dev ||
3992 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
3994 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3995 if (iter->fib6_nh.fib_nh_dev == down_dev ||
3996 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
/* OR @nh_flags into every nexthop of @rt's ECMP group that uses @dev. */
4002 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4003 const struct net_device *dev,
4004 unsigned int nh_flags)
4006 struct fib6_info *iter;
4008 if (rt->fib6_nh.fib_nh_dev == dev)
4009 rt->fib6_nh.fib_nh_flags |= nh_flags;
4010 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4011 if (iter->fib6_nh.fib_nh_dev == dev)
4012 iter->fib6_nh.fib_nh_flags |= nh_flags;
4015 /* called with write lock held for table with rt */
/* fib6_clean_all() callback for device-down events.  Return value -1
 * requests deletion of the route by the cleaner.  UNREGISTER deletes any
 * route on the device; the DOWN case (label elided) deletes single-path
 * routes, flushes a multipath group when every hop would be dead, or just
 * marks the affected hops DEAD and rebalances; the remaining case marks
 * hops LINKDOWN (sparing local/anycast routes) and rebalances.
 */
4016 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4018 const struct arg_netdev_event *arg = p_arg;
4019 const struct net_device *dev = arg->dev;
4020 struct net *net = dev_net(dev);
4022 if (rt == net->ipv6.fib6_null_entry)
4025 switch (arg->event) {
4026 case NETDEV_UNREGISTER:
4027 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4029 if (rt->should_flush)
4031 if (!rt->fib6_nsiblings)
4032 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4033 if (rt6_multipath_uses_dev(rt, dev)) {
4036 count = rt6_multipath_dead_count(rt, dev);
/* +1: rt itself is not on its own siblings list. */
4037 if (rt->fib6_nsiblings + 1 == count) {
4038 rt6_multipath_flush(rt);
4041 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4043 fib6_update_sernum(net, rt);
4044 rt6_multipath_rebalance(rt);
4048 if (rt->fib6_nh.fib_nh_dev != dev ||
4049 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4051 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4052 rt6_multipath_rebalance(rt);
/* Sweep the FIB for a device going down/unregistering, optionally
 * suppressing netlink notifications per the per-netns
 * skip_notify_on_dev_down sysctl.
 */
4059 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4061 struct arg_netdev_event arg = {
4067 struct net *net = dev_net(dev);
4069 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4070 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4072 fib6_clean_all(net, fib6_ifdown, &arg);
/* Disable IPv6 on @dev: sweep FIB routes for the event, flush uncached
 * dsts referencing the device, and tear down its ND neighbour entries.
 */
4075 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4077 rt6_sync_down_dev(dev, event);
4078 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4079 neigh_ifdown(&nd_tbl, dev);
/* Argument bundle for rt6_mtu_change_route(): the device whose MTU
 * changed (the mtu member line is elided in this excerpt).
 */
4082 struct rt6_mtu_change_arg {
4083 struct net_device *dev;
/* fib6_clean_all() callback for a device MTU change: update RTAX_MTU on
 * routes over the device (unless the metric is locked), and propagate the
 * new MTU into cached exception routes under rt6_exception_lock.
 */
4087 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4089 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4090 struct inet6_dev *idev;
4092 /* In IPv6 pmtu discovery is not optional,
4093 so that RTAX_MTU lock cannot disable it.
4094 We still use this lock to block changes
4095 caused by addrconf/ndisc.
4098 idev = __in6_dev_get(arg->dev);
4102 /* For administrative MTU increase, there is no way to discover
4103 IPv6 PMTU increase, so PMTU increase should be updated here.
4104 Since RFC 1981 doesn't include administrative MTU increase
4105 update PMTU increase is a MUST. (i.e. jumbo frame)
4107 if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4108 !fib6_metric_locked(rt, RTAX_MTU)) {
4109 u32 mtu = rt->fib6_pmtu;
4111 if (mtu >= arg->mtu ||
4112 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4113 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4115 spin_lock_bh(&rt6_exception_lock);
4116 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4117 spin_unlock_bh(&rt6_exception_lock);
/* Entry point for netdev MTU changes: sweep the FIB with
 * rt6_mtu_change_route() for @dev/@mtu.
 */
4122 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4124 struct rt6_mtu_change_arg arg = {
4129 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE on IPv6;
 * consumed by nlmsg_parse() in rtm_to_fib6_config() and the dump path.
 */
4132 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4133 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4134 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4135 [RTA_OIF] = { .type = NLA_U32 },
4136 [RTA_IIF] = { .type = NLA_U32 },
4137 [RTA_PRIORITY] = { .type = NLA_U32 },
4138 [RTA_METRICS] = { .type = NLA_NESTED },
4139 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4140 [RTA_PREF] = { .type = NLA_U8 },
4141 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4142 [RTA_ENCAP] = { .type = NLA_NESTED },
4143 [RTA_EXPIRES] = { .type = NLA_U32 },
4144 [RTA_UID] = { .type = NLA_U32 },
4145 [RTA_MARK] = { .type = NLA_U32 },
4146 [RTA_TABLE] = { .type = NLA_U32 },
4147 [RTA_IP_PROTO] = { .type = NLA_U8 },
4148 [RTA_SPORT] = { .type = NLA_U16 },
4149 [RTA_DPORT] = { .type = NLA_U16 },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config:
 * validate attributes against rtm_ipv6_policy, map the rtm_type to route
 * flags (reject variants, local, cloned), and copy every recognized RTA_*
 * attribute.  RTA_VIA is rejected for IPv6.  Errors go through @extack.
 * (Excerpt: some error-return lines are elided.)
 */
4152 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4153 struct fib6_config *cfg,
4154 struct netlink_ext_ack *extack)
4157 struct nlattr *tb[RTA_MAX+1];
4161 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4167 rtm = nlmsg_data(nlh);
4169 *cfg = (struct fib6_config){
4170 .fc_table = rtm->rtm_table,
4171 .fc_dst_len = rtm->rtm_dst_len,
4172 .fc_src_len = rtm->rtm_src_len,
4174 .fc_protocol = rtm->rtm_protocol,
4175 .fc_type = rtm->rtm_type,
4177 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4178 .fc_nlinfo.nlh = nlh,
4179 .fc_nlinfo.nl_net = sock_net(skb->sk),
/* All reject-style route types carry RTF_REJECT; fib6_type keeps the
 * specific variant for ICMP error selection.
 */
4182 if (rtm->rtm_type == RTN_UNREACHABLE ||
4183 rtm->rtm_type == RTN_BLACKHOLE ||
4184 rtm->rtm_type == RTN_PROHIBIT ||
4185 rtm->rtm_type == RTN_THROW)
4186 cfg->fc_flags |= RTF_REJECT;
4188 if (rtm->rtm_type == RTN_LOCAL)
4189 cfg->fc_flags |= RTF_LOCAL;
4191 if (rtm->rtm_flags & RTM_F_CLONED)
4192 cfg->fc_flags |= RTF_CACHE;
4194 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4196 if (tb[RTA_GATEWAY]) {
4197 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4198 cfg->fc_flags |= RTF_GATEWAY;
4201 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
/* Copy only the prefix-length-covered bytes of dst/src. */
4206 int plen = (rtm->rtm_dst_len + 7) >> 3;
4208 if (nla_len(tb[RTA_DST]) < plen)
4211 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4215 int plen = (rtm->rtm_src_len + 7) >> 3;
4217 if (nla_len(tb[RTA_SRC]) < plen)
4220 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4223 if (tb[RTA_PREFSRC])
4224 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4227 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4229 if (tb[RTA_PRIORITY])
4230 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4232 if (tb[RTA_METRICS]) {
4233 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4234 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4238 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4240 if (tb[RTA_MULTIPATH]) {
4241 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4242 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4244 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4245 cfg->fc_mp_len, extack);
/* Unknown router preference values fall back to medium (RFC 4191). */
4251 pref = nla_get_u8(tb[RTA_PREF]);
4252 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4253 pref != ICMPV6_ROUTER_PREF_HIGH)
4254 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4255 cfg->fc_flags |= RTF_PREF(pref);
4259 cfg->fc_encap = tb[RTA_ENCAP];
4261 if (tb[RTA_ENCAP_TYPE]) {
4262 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4264 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4269 if (tb[RTA_EXPIRES]) {
4270 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4272 if (addrconf_finite_timeout(timeout)) {
4273 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4274 cfg->fc_flags |= RTF_EXPIRES;
4284 struct fib6_info *fib6_info;
4285 struct fib6_config r_cfg;
4286 struct list_head next;
/*
 * Queue one parsed nexthop (fib6_info + its fib6_config) on rt6_nh_list
 * while a multipath RTM_NEWROUTE is being assembled.  Duplicate nexthops
 * (per rt6_duplicate_nexthop()) are rejected; otherwise a new rt6_nh
 * entry is allocated and linked at the tail so insertion order is kept.
 * NOTE(review): error returns/braces are missing from this extracted
 * view — confirm against the full file before editing.
 */
4289 static int ip6_route_info_append(struct net *net,
4290 struct list_head *rt6_nh_list,
4291 struct fib6_info *rt,
4292 struct fib6_config *r_cfg)
4297 list_for_each_entry(nh, rt6_nh_list, next) {
4298 /* check if fib6_info already exists */
4299 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
/* not a duplicate: allocate a fresh zeroed list entry */
4303 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
/* keep a private copy of the per-nexthop config for later delete/notify */
4307 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4308 list_add_tail(&nh->next, rt6_nh_list);
/*
 * Send the single RTM_NEWROUTE notification for a completed multipath
 * add/append/replace.  For NLM_F_APPEND the notification must start at
 * the first nexthop of the route, so rt is rewound via the sibling list
 * of the last inserted route before calling inet6_rt_notify().
 */
4313 static void ip6_route_mpath_notify(struct fib6_info *rt,
4314 struct fib6_info *rt_last,
4315 struct nl_info *info,
4318 /* if this is an APPEND route, then rt points to the first route
4319 * inserted and rt_last points to last route inserted. Userspace
4320 * wants a consistent dump of the route which starts at the first
4321 * nexthop. Since sibling routes are always added at the end of
4322 * the list, find the first sibling of the last route appended
4324 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4325 rt = list_first_entry(&rt_last->fib6_siblings,
4331 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/*
 * Handle a multipath RTM_NEWROUTE: parse every rtnexthop in cfg->fc_mp,
 * build a fib6_info per nexthop on a local list, then insert them one by
 * one.  Per-route notifications are suppressed (info->skip_notify) and a
 * single combined notification is sent at the end via
 * ip6_route_mpath_notify().  On a mid-list insertion failure the routes
 * already added are rolled back with ip6_route_del().
 * NOTE(review): several error-path lines are missing from this extracted
 * view; the cleanup labels below are only partially visible.
 */
4334 static int ip6_route_multipath_add(struct fib6_config *cfg,
4335 struct netlink_ext_ack *extack)
4337 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4338 struct nl_info *info = &cfg->fc_nlinfo;
4339 struct fib6_config r_cfg;
4340 struct rtnexthop *rtnh;
4341 struct fib6_info *rt;
4342 struct rt6_nh *err_nh;
4343 struct rt6_nh *nh, *nh_safe;
4349 int replace = (cfg->fc_nlinfo.nlh &&
4350 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4351 LIST_HEAD(rt6_nh_list);
4353 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4354 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4355 nlflags |= NLM_F_APPEND;
4357 remaining = cfg->fc_mp_len;
4358 rtnh = (struct rtnexthop *)cfg->fc_mp;
4360 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4361 * fib6_info structs per nexthop
4363 while (rtnh_ok(rtnh, remaining)) {
/* start from the shared config, then override per-nexthop fields */
4364 memcpy(&r_cfg, cfg, sizeof(*cfg));
4365 if (rtnh->rtnh_ifindex)
4366 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4368 attrlen = rtnh_attrlen(rtnh);
4370 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4372 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4374 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4375 r_cfg.fc_flags |= RTF_GATEWAY;
4377 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4378 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4380 r_cfg.fc_encap_type = nla_get_u16(nla);
4383 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4384 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
/* only ECMP-capable (gateway) routes may be combined via multipath */
4390 if (!rt6_qualify_for_ecmp(rt)) {
4392 NL_SET_ERR_MSG(extack,
4393 "Device only routes can not be added for IPv6 using the multipath API.");
4394 fib6_info_release(rt);
/* rtnh_hops is zero-based on the wire; weight is one-based */
4398 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4400 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4403 fib6_info_release(rt);
4407 rtnh = rtnh_next(rtnh, &remaining);
4410 /* for add and replace send one notification with all nexthops.
4411 * Skip the notification in fib6_add_rt2node and send one with
4412 * the full route when done
4414 info->skip_notify = 1;
4417 list_for_each_entry(nh, &rt6_nh_list, next) {
4418 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4419 fib6_info_release(nh->fib6_info);
4422 /* save reference to last route successfully inserted */
4423 rt_last = nh->fib6_info;
4425 /* save reference to first route for notification */
4427 rt_notif = nh->fib6_info;
4430 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4431 nh->fib6_info = NULL;
4434 NL_SET_ERR_MSG_MOD(extack,
4435 "multipath route replace failed (check consistency of installed routes)");
4440 /* Because each route is added like a single route we remove
4441 * these flags after the first nexthop: if there is a collision,
4442 * we have already failed to add the first nexthop:
4443 * fib6_add_rt2node() has rejected it; when replacing, old
4444 * nexthops have been replaced by first new, the rest should
4447 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4452 /* success ... tell user about new route */
4453 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4457 /* send notification for routes that were added so that
4458 * the delete notifications sent by ip6_route_del are
4462 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4464 /* Delete routes that were already added */
4465 list_for_each_entry(nh, &rt6_nh_list, next) {
4468 ip6_route_del(&nh->r_cfg, extack);
/* free any list entries never consumed by the insert loop */
4472 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4474 fib6_info_release(nh->fib6_info);
4475 list_del(&nh->next);
/*
 * Handle a multipath RTM_DELROUTE: walk each rtnexthop in cfg->fc_mp and
 * delete the corresponding single route via ip6_route_del(), overriding
 * ifindex/gateway per nexthop.  Deletion continues past individual
 * failures; the last error observed is what gets reported.
 */
4482 static int ip6_route_multipath_del(struct fib6_config *cfg,
4483 struct netlink_ext_ack *extack)
4485 struct fib6_config r_cfg;
4486 struct rtnexthop *rtnh;
4489 int err = 1, last_err = 0;
4491 remaining = cfg->fc_mp_len;
4492 rtnh = (struct rtnexthop *)cfg->fc_mp;
4494 /* Parse a Multipath Entry */
4495 while (rtnh_ok(rtnh, remaining)) {
4496 memcpy(&r_cfg, cfg, sizeof(*cfg));
4497 if (rtnh->rtnh_ifindex)
4498 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4500 attrlen = rtnh_attrlen(rtnh);
4502 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4504 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
/* 16 bytes == sizeof(struct in6_addr) */
4506 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4507 r_cfg.fc_flags |= RTF_GATEWAY;
4510 err = ip6_route_del(&r_cfg, extack);
4514 rtnh = rtnh_next(rtnh, &remaining);
/*
 * RTM_DELROUTE doit handler: convert the netlink message to a
 * fib6_config, then dispatch to the multipath or single-route delete
 * path.  fc_delete_all_nh is set so a plain delete removes the route
 * with all of its nexthops.
 */
4520 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4521 struct netlink_ext_ack *extack)
4523 struct fib6_config cfg;
4526 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4531 return ip6_route_multipath_del(&cfg, extack);
4533 cfg.fc_delete_all_nh = 1;
4534 return ip6_route_del(&cfg, extack);
/*
 * RTM_NEWROUTE doit handler: convert the netlink message to a
 * fib6_config, default the metric to IP6_RT_PRIO_USER when userspace
 * left it at 0, then dispatch to the multipath or single-route add path.
 */
4538 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4539 struct netlink_ext_ack *extack)
4541 struct fib6_config cfg;
4544 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4548 if (cfg.fc_metric == 0)
4549 cfg.fc_metric = IP6_RT_PRIO_USER;
4552 return ip6_route_multipath_add(&cfg, extack);
4554 return ip6_route_add(&cfg, GFP_KERNEL, extack);
/*
 * Worst-case netlink message size for one route notification, used to
 * size the skb in inet6_rt_notify().  For multipath routes each sibling
 * contributes an RTA_MULTIPATH nexthop (rtnexthop + gateway + encap).
 * Must stay in sync with what rt6_fill_node() can emit, otherwise
 * rt6_fill_node() returns -EMSGSIZE (see the WARN_ON in inet6_rt_notify).
 */
4557 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4559 int nexthop_len = 0;
4561 if (rt->fib6_nsiblings) {
4562 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4563 + NLA_ALIGN(sizeof(struct rtnexthop))
4564 + nla_total_size(16) /* RTA_GATEWAY */
4565 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4567 nexthop_len *= rt->fib6_nsiblings;
4570 return NLMSG_ALIGN(sizeof(struct rtmsg))
4571 + nla_total_size(16) /* RTA_SRC */
4572 + nla_total_size(16) /* RTA_DST */
4573 + nla_total_size(16) /* RTA_GATEWAY */
4574 + nla_total_size(16) /* RTA_PREFSRC */
4575 + nla_total_size(4) /* RTA_TABLE */
4576 + nla_total_size(4) /* RTA_IIF */
4577 + nla_total_size(4) /* RTA_OIF */
4578 + nla_total_size(4) /* RTA_PRIORITY */
4579 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4580 + nla_total_size(sizeof(struct rta_cacheinfo))
4581 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4582 + nla_total_size(1) /* RTA_PREF */
4583 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
/*
 * Fill one RTM_NEWROUTE/RTM_DELROUTE message into skb for route rt.
 * When dst is non-NULL the message describes a cached rt6_info clone
 * (dst/src keys and flags come from rt6), otherwise it describes the
 * FIB entry itself.  Returns 0 on success or -EMSGSIZE-style failure
 * after nlmsg_cancel() (the tail labels are partially out of view).
 * @iif:  input interface to report (0 = none)
 * @dest/@src: exact addresses for cache entries; NULL for FIB dumps
 */
4587 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4588 struct fib6_info *rt, struct dst_entry *dst,
4589 struct in6_addr *dest, struct in6_addr *src,
4590 int iif, int type, u32 portid, u32 seq,
4593 struct rt6_info *rt6 = (struct rt6_info *)dst;
4594 struct rt6key *rt6_dst, *rt6_src;
4595 u32 *pmetrics, table, rt6_flags;
4596 struct nlmsghdr *nlh;
4600 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
/* pick keys/flags from the cached clone when present, else the FIB entry */
4605 rt6_dst = &rt6->rt6i_dst;
4606 rt6_src = &rt6->rt6i_src;
4607 rt6_flags = rt6->rt6i_flags;
4609 rt6_dst = &rt->fib6_dst;
4610 rt6_src = &rt->fib6_src;
4611 rt6_flags = rt->fib6_flags;
4614 rtm = nlmsg_data(nlh);
4615 rtm->rtm_family = AF_INET6;
4616 rtm->rtm_dst_len = rt6_dst->plen;
4617 rtm->rtm_src_len = rt6_src->plen;
4620 table = rt->fib6_table->tb6_id;
4622 table = RT6_TABLE_UNSPEC;
/* legacy 8-bit rtm_table field can't hold ids >= 256 */
4623 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4624 if (nla_put_u32(skb, RTA_TABLE, table))
4625 goto nla_put_failure;
4627 rtm->rtm_type = rt->fib6_type;
4629 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4630 rtm->rtm_protocol = rt->fib6_protocol;
4632 if (rt6_flags & RTF_CACHE)
4633 rtm->rtm_flags |= RTM_F_CLONED;
/* exact destination (cache entry) vs. prefix (FIB entry) */
4636 if (nla_put_in6_addr(skb, RTA_DST, dest))
4637 goto nla_put_failure;
4638 rtm->rtm_dst_len = 128;
4639 } else if (rtm->rtm_dst_len)
4640 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4641 goto nla_put_failure;
4642 #ifdef CONFIG_IPV6_SUBTREES
4644 if (nla_put_in6_addr(skb, RTA_SRC, src))
4645 goto nla_put_failure;
4646 rtm->rtm_src_len = 128;
4647 } else if (rtm->rtm_src_len &&
4648 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4649 goto nla_put_failure;
4652 #ifdef CONFIG_IPV6_MROUTE
/* multicast destinations are resolved through the mroute cache */
4653 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4654 int err = ip6mr_get_route(net, skb, rtm, portid);
4659 goto nla_put_failure;
4662 if (nla_put_u32(skb, RTA_IIF, iif))
4663 goto nla_put_failure;
4665 struct in6_addr saddr_buf;
4666 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4667 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4668 goto nla_put_failure;
4671 if (rt->fib6_prefsrc.plen) {
4672 struct in6_addr saddr_buf;
4673 saddr_buf = rt->fib6_prefsrc.addr;
4674 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4675 goto nla_put_failure;
/* cached clones carry live metrics in the dst; FIB entries own theirs */
4678 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4679 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4680 goto nla_put_failure;
4682 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4683 goto nla_put_failure;
4685 /* For multipath routes, walk the siblings list and add
4686 * each as a nexthop within RTA_MULTIPATH.
4689 if (rt6_flags & RTF_GATEWAY &&
4690 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4691 goto nla_put_failure;
4693 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4694 goto nla_put_failure;
4695 } else if (rt->fib6_nsiblings) {
4696 struct fib6_info *sibling, *next_sibling;
4699 mp = nla_nest_start(skb, RTA_MULTIPATH);
4701 goto nla_put_failure;
4703 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4704 rt->fib6_nh.fib_nh_weight) < 0)
4705 goto nla_put_failure;
4707 list_for_each_entry_safe(sibling, next_sibling,
4708 &rt->fib6_siblings, fib6_siblings) {
4709 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4710 sibling->fib6_nh.fib_nh_weight) < 0)
4711 goto nla_put_failure;
4714 nla_nest_end(skb, mp);
4716 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4717 &rtm->rtm_flags, false) < 0)
4718 goto nla_put_failure;
4721 if (rt6_flags & RTF_EXPIRES) {
4722 expires = dst ? dst->expires : rt->expires;
4726 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4727 goto nla_put_failure;
4729 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4730 goto nla_put_failure;
4733 nlmsg_end(skb, nlh);
4737 nlmsg_cancel(skb, nlh);
/*
 * Return true if route f6i uses net_device dev, either via its primary
 * nexthop or via any sibling nexthop of a multipath route.  Used by
 * rt6_dump_route() to implement dump filtering by device.
 */
4741 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4742 const struct net_device *dev)
4744 if (f6i->fib6_nh.fib_nh_dev == dev)
4747 if (f6i->fib6_nsiblings) {
4748 struct fib6_info *sibling, *next_sibling;
4750 list_for_each_entry_safe(sibling, next_sibling,
4751 &f6i->fib6_siblings, fib6_siblings) {
4752 if (sibling->fib6_nh.fib_nh_dev == dev)
/*
 * Per-route callback for a FIB dump (RTM_GETROUTE with NLM_F_DUMP).
 * Skips the null entry and routes excluded by the dump filter
 * (prefix-only, route type, device, protocol); for everything else it
 * emits an RTM_NEWROUTE via rt6_fill_node() with NLM_F_MULTI set, plus
 * NLM_F_DUMP_FILTERED when any filter was active.
 */
4760 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4762 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4763 struct fib_dump_filter *filter = &arg->filter;
4764 unsigned int flags = NLM_F_MULTI;
4765 struct net *net = arg->net;
4767 if (rt == net->ipv6.fib6_null_entry)
4770 if ((filter->flags & RTM_F_PREFIX) &&
4771 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4772 /* success since this is not a prefix route */
4775 if (filter->filter_set) {
4776 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4777 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4778 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4781 flags |= NLM_F_DUMP_FILTERED;
4784 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4785 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4786 arg->cb->nlh->nlmsg_seq, flags);
/*
 * Validate an RTM_GETROUTE request before handling it.  Non-strict
 * sockets get plain nlmsg_parse(); strict-checking sockets additionally
 * require src/dst lengths of 0 or 128, zeroed unused header fields,
 * only the RTM_F_FIB_MATCH flag, strict attribute parsing, and (in the
 * attribute loop, partially out of view) only the supported attributes.
 */
4789 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4790 const struct nlmsghdr *nlh,
4792 struct netlink_ext_ack *extack)
4797 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4798 NL_SET_ERR_MSG_MOD(extack,
4799 "Invalid header for get route request");
4803 if (!netlink_strict_get_check(skb))
4804 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4805 rtm_ipv6_policy, extack);
4807 rtm = nlmsg_data(nlh);
4808 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4809 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4810 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4812 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4815 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4816 NL_SET_ERR_MSG_MOD(extack,
4817 "Invalid flags for get route request");
4821 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4822 rtm_ipv6_policy, extack);
4826 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4827 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4828 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4832 for (i = 0; i <= RTA_MAX; i++) {
4848 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
/*
 * RTM_GETROUTE doit handler: build a flowi6 from the request attributes,
 * perform an input-style lookup (when RTA_IIF is given) or an output
 * lookup, then answer with a single RTM_NEWROUTE.  With RTM_F_FIB_MATCH
 * the matched FIB entry is reported instead of the resolved dst.
 * Reply is unicast back to the requesting portid.
 * NOTE(review): error-path lines (kfree_skb, dst_release, rcu_read_unlock)
 * are missing from this extracted view.
 */
4856 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4857 struct netlink_ext_ack *extack)
4859 struct net *net = sock_net(in_skb->sk);
4860 struct nlattr *tb[RTA_MAX+1];
4861 int err, iif = 0, oif = 0;
4862 struct fib6_info *from;
4863 struct dst_entry *dst;
4864 struct rt6_info *rt;
4865 struct sk_buff *skb;
4867 struct flowi6 fl6 = {};
4870 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4875 rtm = nlmsg_data(nlh);
4876 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4877 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4880 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4883 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4887 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4890 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4894 iif = nla_get_u32(tb[RTA_IIF]);
4897 oif = nla_get_u32(tb[RTA_OIF]);
4900 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4903 fl6.flowi6_uid = make_kuid(current_user_ns(),
4904 nla_get_u32(tb[RTA_UID]));
4906 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4909 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4912 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4914 if (tb[RTA_IP_PROTO]) {
4915 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4916 &fl6.flowi6_proto, AF_INET6,
/* input path: emulate reception on the given interface */
4923 struct net_device *dev;
4928 dev = dev_get_by_index_rcu(net, iif);
4935 fl6.flowi6_iif = iif;
4937 if (!ipv6_addr_any(&fl6.saddr))
4938 flags |= RT6_LOOKUP_F_HAS_SADDR;
4940 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
/* output path: ordinary route lookup */
4944 fl6.flowi6_oif = oif;
4946 dst = ip6_route_output(net, NULL, &fl6);
4950 rt = container_of(dst, struct rt6_info, dst);
4951 if (rt->dst.error) {
4952 err = rt->dst.error;
4957 if (rt == net->ipv6.ip6_null_entry) {
4958 err = rt->dst.error;
4963 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4970 skb_dst_set(skb, &rt->dst);
4973 from = rcu_dereference(rt->from);
4976 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4977 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4980 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4981 &fl6.saddr, iif, RTM_NEWROUTE,
4982 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4991 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/*
 * Broadcast a route change (RTM_NEWROUTE/RTM_DELROUTE) to the
 * RTNLGRP_IPV6_ROUTE multicast group.  The skb is sized by
 * rt6_nlmsg_size(); an -EMSGSIZE from rt6_fill_node() therefore
 * indicates a bug in that size estimate (hence the WARN_ON).
 * On failure, listeners are told via rtnl_set_sk_err().
 */
4996 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4997 unsigned int nlm_flags)
4999 struct sk_buff *skb;
5000 struct net *net = info->nl_net;
5005 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5007 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5011 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5012 event, info->portid, seq, nlm_flags);
5014 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5015 WARN_ON(err == -EMSGSIZE);
5019 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5020 info->nlh, gfp_any());
5024 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * Netdevice notifier: only acts on the loopback device.  On register it
 * wires the per-netns special routes (null/prohibit/blackhole entries)
 * to loopback; on unregister it drops their idev references.  The
 * reg_state check guards against NETDEV_UNREGISTER being delivered more
 * than once (see comment below).
 */
5027 static int ip6_route_dev_notify(struct notifier_block *this,
5028 unsigned long event, void *ptr)
5030 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5031 struct net *net = dev_net(dev);
5033 if (!(dev->flags & IFF_LOOPBACK))
5036 if (event == NETDEV_REGISTER) {
5037 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5038 net->ipv6.ip6_null_entry->dst.dev = dev;
5039 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5040 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5041 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5042 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5043 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5044 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5046 } else if (event == NETDEV_UNREGISTER &&
5047 dev->reg_state != NETREG_UNREGISTERED) {
5048 /* NETDEV_UNREGISTER could be fired for multiple times by
5049 * netdev_wait_allrefs(). Make sure we only call this once.
5051 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5052 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5053 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5054 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5065 #ifdef CONFIG_PROC_FS
/*
 * seq_file show for /proc/net/rt6_stats: one line of seven hex counters
 * (fib nodes, route nodes, allocated rt, rt entries, cached rt, dst
 * entries in use, discarded routes) for this network namespace.
 */
5066 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5068 struct net *net = (struct net *)seq->private;
5069 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5070 net->ipv6.rt6_stats->fib_nodes,
5071 net->ipv6.rt6_stats->fib_route_nodes,
5072 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5073 net->ipv6.rt6_stats->fib_rt_entries,
5074 net->ipv6.rt6_stats->fib_rt_cache,
5075 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5076 net->ipv6.rt6_stats->fib_discarded_routes);
5080 #endif /* CONFIG_PROC_FS */
5082 #ifdef CONFIG_SYSCTL
/*
 * Handler for net.ipv6.route.flush: writing a value triggers a garbage
 * collection of the routing cache via fib6_run_gc().  The delay read
 * from the sysctl controls how aggressive the flush is (<= 0 means
 * flush everything now).  ctl->extra1 carries the struct net pointer.
 */
5085 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5086 void __user *buffer, size_t *lenp, loff_t *ppos)
5094 net = (struct net *)ctl->extra1;
5095 delay = net->ipv6.sysctl.flush_delay;
5096 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5100 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/*
 * Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers reference init_net here and are rewritten per namespace in
 * ipv6_route_sysctl_init() — keep the entry ORDER in sync with the
 * table[N].data assignments there.
 */
5107 static struct ctl_table ipv6_route_table_template[] = {
5109 .procname = "flush",
5110 .data = &init_net.ipv6.sysctl.flush_delay,
5111 .maxlen = sizeof(int),
5113 .proc_handler = ipv6_sysctl_rtcache_flush
5116 .procname = "gc_thresh",
5117 .data = &ip6_dst_ops_template.gc_thresh,
5118 .maxlen = sizeof(int),
5120 .proc_handler = proc_dointvec,
5123 .procname = "max_size",
5124 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
5125 .maxlen = sizeof(int),
5127 .proc_handler = proc_dointvec,
5130 .procname = "gc_min_interval",
5131 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5132 .maxlen = sizeof(int),
5134 .proc_handler = proc_dointvec_jiffies,
5137 .procname = "gc_timeout",
5138 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5139 .maxlen = sizeof(int),
5141 .proc_handler = proc_dointvec_jiffies,
5144 .procname = "gc_interval",
5145 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5146 .maxlen = sizeof(int),
5148 .proc_handler = proc_dointvec_jiffies,
5151 .procname = "gc_elasticity",
5152 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5153 .maxlen = sizeof(int),
5155 .proc_handler = proc_dointvec,
5158 .procname = "mtu_expires",
5159 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5160 .maxlen = sizeof(int),
5162 .proc_handler = proc_dointvec_jiffies,
5165 .procname = "min_adv_mss",
5166 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5167 .maxlen = sizeof(int),
5169 .proc_handler = proc_dointvec,
5172 .procname = "gc_min_interval_ms",
5173 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5174 .maxlen = sizeof(int),
5176 .proc_handler = proc_dointvec_ms_jiffies,
5179 .procname = "skip_notify_on_dev_down",
5180 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5181 .maxlen = sizeof(int),
5183 .proc_handler = proc_dointvec,
/*
 * Clone ipv6_route_table_template for a new network namespace and point
 * each entry's .data at the namespace's own sysctl storage.  The
 * numeric indices below must match the template's entry order.  For
 * non-init user namespaces the "flush" entry is hidden by clearing its
 * procname.  Returns the table (freed by the caller on teardown).
 */
5190 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5192 struct ctl_table *table;
5194 table = kmemdup(ipv6_route_table_template,
5195 sizeof(ipv6_route_table_template),
5199 table[0].data = &net->ipv6.sysctl.flush_delay;
5200 table[0].extra1 = net;
5201 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5202 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5203 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5204 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5205 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5206 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5207 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5208 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5209 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5210 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5212 /* Don't export sysctls to unprivileged users */
5213 if (net->user_ns != &init_user_ns)
5214 table[0].procname = NULL;
/*
 * Per-netns init: set up dst ops, allocate the special route entries
 * (fib6 null, ip6 null; plus prohibit and blackhole when multiple
 * tables are enabled) from their templates, and seed the namespace's
 * route sysctl defaults.  Error unwinding is via the goto labels at the
 * bottom, each freeing what was allocated before it.
 */
5221 static int __net_init ip6_route_net_init(struct net *net)
5225 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5226 sizeof(net->ipv6.ip6_dst_ops));
5228 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5229 goto out_ip6_dst_ops;
5231 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5232 sizeof(*net->ipv6.fib6_null_entry),
5234 if (!net->ipv6.fib6_null_entry)
5235 goto out_ip6_dst_entries;
5237 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5238 sizeof(*net->ipv6.ip6_null_entry),
5240 if (!net->ipv6.ip6_null_entry)
5241 goto out_fib6_null_entry;
5242 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5243 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5244 ip6_template_metrics, true);
5246 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5247 net->ipv6.fib6_has_custom_rules = false;
5248 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5249 sizeof(*net->ipv6.ip6_prohibit_entry),
5251 if (!net->ipv6.ip6_prohibit_entry)
5252 goto out_ip6_null_entry;
5253 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5254 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5255 ip6_template_metrics, true);
5257 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5258 sizeof(*net->ipv6.ip6_blk_hole_entry),
5260 if (!net->ipv6.ip6_blk_hole_entry)
5261 goto out_ip6_prohibit_entry;
5262 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5263 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5264 ip6_template_metrics, true);
/* default sysctl values for this namespace */
5267 net->ipv6.sysctl.flush_delay = 0;
5268 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5269 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5270 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5271 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5272 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5273 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5274 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5275 net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5277 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5283 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5284 out_ip6_prohibit_entry:
5285 kfree(net->ipv6.ip6_prohibit_entry);
5287 kfree(net->ipv6.ip6_null_entry);
5289 out_fib6_null_entry:
5290 kfree(net->ipv6.fib6_null_entry);
5291 out_ip6_dst_entries:
5292 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Per-netns teardown: free the special route entries allocated in
 * ip6_route_net_init() and destroy the namespace's dst entry counter.
 */
5297 static void __net_exit ip6_route_net_exit(struct net *net)
5299 kfree(net->ipv6.fib6_null_entry);
5300 kfree(net->ipv6.ip6_null_entry);
5301 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5302 kfree(net->ipv6.ip6_prohibit_entry);
5303 kfree(net->ipv6.ip6_blk_hole_entry);
5305 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Late per-netns init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries once the routing state exists.
 */
5308 static int __net_init ip6_route_net_init_late(struct net *net)
5310 #ifdef CONFIG_PROC_FS
5311 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5312 sizeof(struct ipv6_route_iter));
5313 proc_create_net_single("rt6_stats", 0444, net->proc_net,
5314 rt6_stats_seq_show, NULL);
/* Late per-netns teardown: remove the proc entries created above. */
5319 static void __net_exit ip6_route_net_exit_late(struct net *net)
5321 #ifdef CONFIG_PROC_FS
5322 remove_proc_entry("ipv6_route", net->proc_net);
5323 remove_proc_entry("rt6_stats", net->proc_net);
/* Pernet ops for the core routing state (special entries, sysctls). */
5327 static struct pernet_operations ip6_route_net_ops = {
5328 .init = ip6_route_net_init,
5329 .exit = ip6_route_net_exit,
/*
 * Per-netns inetpeer base: allocated on namespace creation, and the
 * whole peer tree is invalidated on namespace exit.
 */
5332 static int __net_init ipv6_inetpeer_init(struct net *net)
5334 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5338 inet_peer_base_init(bp);
5339 net->ipv6.peers = bp;
5343 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5345 struct inet_peer_base *bp = net->ipv6.peers;
5347 net->ipv6.peers = NULL;
5348 inetpeer_invalidate_tree(bp);
5352 static struct pernet_operations ipv6_inetpeer_ops = {
5353 .init = ipv6_inetpeer_init,
5354 .exit = ipv6_inetpeer_exit,
/* Pernet ops for late (proc) setup, registered after fib6 rules. */
5357 static struct pernet_operations ip6_route_net_late_ops = {
5358 .init = ip6_route_net_init_late,
5359 .exit = ip6_route_net_exit_late,
/*
 * Loopback register/unregister hook; priority chosen to run after
 * addrconf's own notifier (ADDRCONF_NOTIFY_PRIORITY - 10).
 */
5362 static struct notifier_block ip6_route_dev_notifier = {
5363 .notifier_call = ip6_route_dev_notify,
5364 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/*
 * Wire init_net's special route entries to its loopback device.  Normal
 * namespaces get this via the NETDEV_REGISTER notifier, but loopback
 * for init_net registers before the notifier exists, so it is done here
 * by hand (see comment below).
 */
5367 void __init ip6_route_init_special_entries(void)
5369 /* Registering of the loopback is done before this portion of code,
5370 * the loopback reference in rt6_info will not be taken, do it
5371 * manually for init_net */
5372 init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5373 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5374 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5375 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5376 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5377 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5378 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5379 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/*
 * Module init for IPv6 routing: create the rt6_info slab, register the
 * pernet subsystems (inetpeer, core route state, late/proc), fib6 and
 * fib6 rules, the three rtnetlink message handlers, the netdevice
 * notifier, and initialize the per-cpu uncached-route lists.  Teardown
 * on failure unwinds in reverse order via the labels at the bottom.
 * NOTE(review): fib6_init() call and some labels are missing from this
 * extracted view.
 */
5383 int __init ip6_route_init(void)
5389 ip6_dst_ops_template.kmem_cachep =
5390 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5391 SLAB_HWCACHE_ALIGN, NULL);
5392 if (!ip6_dst_ops_template.kmem_cachep)
5395 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5397 goto out_kmem_cache;
5399 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5401 goto out_dst_entries;
5403 ret = register_pernet_subsys(&ip6_route_net_ops);
5405 goto out_register_inetpeer;
/* blackhole dsts share the same slab as regular rt6_info */
5407 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5411 goto out_register_subsys;
5417 ret = fib6_rules_init();
5421 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5423 goto fib6_rules_init;
5425 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5426 inet6_rtm_newroute, NULL, 0);
5428 goto out_register_late_subsys;
5430 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5431 inet6_rtm_delroute, NULL, 0);
5433 goto out_register_late_subsys;
5435 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5436 inet6_rtm_getroute, NULL,
5437 RTNL_FLAG_DOIT_UNLOCKED);
5439 goto out_register_late_subsys;
5441 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5443 goto out_register_late_subsys;
5445 for_each_possible_cpu(cpu) {
5446 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5448 INIT_LIST_HEAD(&ul->head);
5449 spin_lock_init(&ul->lock);
5455 out_register_late_subsys:
5456 rtnl_unregister_all(PF_INET6);
5457 unregister_pernet_subsys(&ip6_route_net_late_ops);
5459 fib6_rules_cleanup();
5464 out_register_subsys:
5465 unregister_pernet_subsys(&ip6_route_net_ops);
5466 out_register_inetpeer:
5467 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5469 dst_entries_destroy(&ip6_dst_blackhole_ops);
5471 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5475 void ip6_route_cleanup(void)
5477 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5478 unregister_pernet_subsys(&ip6_route_net_late_ops);
5479 fib6_rules_cleanup();
5482 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5483 unregister_pernet_subsys(&ip6_route_net_ops);
5484 dst_entries_destroy(&ip6_dst_blackhole_ops);
5485 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);