2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
67 #include <linux/uaccess.h>
70 #include <linux/sysctl.h>
73 static int ip6_rt_type_to_error(u8 fib6_type);
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
/* Failure codes of enum rt6_nud_state (the enum's opening line and any
 * other enumerators are not visible in this view). All three are negative,
 * which lets callers reject a nexthop with a plain "< 0" test.
 */
81 RT6_NUD_FAIL_HARD = -3,
82 RT6_NUD_FAIL_PROBE = -2,
83 RT6_NUD_FAIL_DO_RR = -1,
/* Forward declarations for the dst_ops callbacks and file-local helpers
 * that are referenced (e.g. from the dst_ops tables and the entry
 * templates below) before their definitions. Several of the definitions
 * lie outside this view.
 */
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(struct dst_ops *ops);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int ip6_pkt_prohibit(struct sk_buff *skb);
99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void ip6_link_failure(struct sk_buff *skb);
101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 struct sk_buff *skb, u32 mtu);
103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109 struct fib6_info *rt, struct dst_entry *dst,
110 struct in6_addr *dest, struct in6_addr *src,
111 int iif, int type, u32 portid, u32 seq,
113 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
114 struct in6_addr *daddr,
115 struct in6_addr *saddr);
/* Route Information option helpers, only built with CONFIG_IPV6_ROUTE_INFO. */
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev,
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124 const struct in6_addr *prefix, int prefixlen,
125 const struct in6_addr *gwaddr,
126 struct net_device *dev);
/* Per-CPU list of rt6_info entries, linked through rt6_info::rt6i_uncached.
 * The users below also take a "lock" member with spin_lock_bh(); its
 * declaration is not visible in this view.
 */
129 struct uncached_list {
131 struct list_head head;
/* One instance per CPU. */
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Append @rt to the current CPU's uncached list.
 *
 * The list is remembered in rt->rt6i_uncached_list so that
 * rt6_uncached_list_del() can later lock the same list, whichever CPU it
 * runs on. The insertion is done under ul->lock with BHs disabled.
 */
136 void rt6_uncached_list_add(struct rt6_info *rt)
138 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
140 rt->rt6i_uncached_list = ul;
142 spin_lock_bh(&ul->lock);
143 list_add_tail(&rt->rt6i_uncached, &ul->head);
144 spin_unlock_bh(&ul->lock);
/* Remove @rt from the uncached list it was put on, if any.
 *
 * Does nothing when rt->rt6i_uncached is an empty list head. Otherwise the
 * entry is unlinked under the lock of the list recorded at add time, and
 * the owning namespace's fib_rt_uncache counter is decremented.
 */
147 void rt6_uncached_list_del(struct rt6_info *rt)
149 if (!list_empty(&rt->rt6i_uncached)) {
150 struct uncached_list *ul = rt->rt6i_uncached_list;
151 struct net *net = dev_net(rt->dst.dev);
153 spin_lock_bh(&ul->lock);
154 list_del(&rt->rt6i_uncached);
155 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156 spin_unlock_bh(&ul->lock);
/* Re-point uncached routes that reference @dev at @net's loopback device.
 *
 * @dev == loopback is special-cased up front (the statement under that test
 * is not visible here; presumably an early return). Every possible CPU's
 * list is then walked under that list's lock.
 */
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
162 struct net_device *loopback_dev = net->loopback_dev;
165 if (dev == loopback_dev)
168 for_each_possible_cpu(cpu) {
169 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
172 spin_lock_bh(&ul->lock);
173 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174 struct inet6_dev *rt_idev = rt->rt6i_idev;
175 struct net_device *rt_dev = rt->dst.dev;
/* Swap the inet6_dev: take a reference on loopback's, drop the old one. */
177 if (rt_idev->dev == dev) {
178 rt->rt6i_idev = in6_dev_get(loopback_dev);
179 in6_dev_put(rt_idev);
/* Likewise move dst.dev to loopback and hold it. The guarding test and
 * the release of the old device are not visible in this view.
 */
183 rt->dst.dev = loopback_dev;
184 dev_hold(rt->dst.dev);
188 spin_unlock_bh(&ul->lock);
/* Pick the address to use as the neighbour-table key.
 *
 * @p wins whenever it is not the unspecified address. The only fallback
 * visible here is the destination address of the skb's IPv6 header; the
 * remaining parameters and the conditions selecting between fallbacks are
 * not visible in this view.
 */
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
196 if (!ipv6_addr_any(p))
197 return (const void *) p;
199 return &ipv6_hdr(skb)->daddr;
/* Find, or failing that create, the neighbour entry on @dev for the address
 * selected by choose_neigh_daddr() from @gw, the skb and the caller's daddr.
 *
 * Creation goes through nd_tbl. An ERR_PTR from neigh_create() is reported
 * to the caller as NULL. (The test between lookup and creation is not
 * visible here.)
 */
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204 struct net_device *dev,
210 daddr = choose_neigh_daddr(gw, skb, daddr);
211 n = __ipv6_neigh_lookup(dev, daddr);
215 n = neigh_create(&nd_tbl, daddr, dev);
216 return IS_ERR(n) ? NULL : n;
/* dst_ops neigh_lookup hook: recover the rt6_info that embeds @dst and
 * delegate to ip6_neigh_lookup() with the route's gateway and dst->dev.
 */
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
223 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
225 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
/* dst_ops confirm_neigh hook: confirm the neighbour entry for the route's
 * gateway, or for @daddr when the gateway is unspecified.
 *
 * Devices flagged IFF_NOARP or IFF_LOOPBACK and multicast addresses are
 * tested for first; the statements under those tests are not visible here
 * (presumably early returns).
 */
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
230 struct net_device *dev = dst->dev;
231 struct rt6_info *rt = (struct rt6_info *)dst;
/* No skb is available here, hence the NULL argument. */
233 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
236 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
238 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
240 __ipv6_confirm_neigh(dev, daddr);
/* Template dst_ops table for IPv6 routes: wires the generic dst callbacks
 * to the IPv6 implementations declared above. Some initializers of this
 * table are not visible in this view.
 */
243 static struct dst_ops ip6_dst_ops_template = {
247 .check = ip6_dst_check,
248 .default_advmss = ip6_default_advmss,
250 .cow_metrics = dst_cow_metrics_generic,
251 .destroy = ip6_dst_destroy,
252 .ifdown = ip6_dst_ifdown,
253 .negative_advice = ip6_negative_advice,
254 .link_failure = ip6_link_failure,
255 .update_pmtu = ip6_rt_update_pmtu,
256 .redirect = rt6_do_redirect,
257 .local_out = __ip6_local_out,
258 .neigh_lookup = ip6_dst_neigh_lookup,
259 .confirm_neigh = ip6_confirm_neigh,
/* mtu hook of the blackhole dst_ops: the raw RTAX_MTU metric of @dst when
 * it is non-zero, otherwise the MTU of the dst's device.
 */
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
264 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
266 return mtu ? : dst->dev->mtu;
/* update_pmtu hook installed in ip6_dst_blackhole_ops. No body statements
 * are visible in this view.
 */
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270 struct sk_buff *skb, u32 mtu)
/* redirect hook installed in ip6_dst_blackhole_ops. The rest of the
 * signature and the body are not visible in this view.
 */
274 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops table for blackhole dsts. It shares destroy, check,
 * default_advmss, cow_metrics and neigh_lookup with the regular IPv6 ops,
 * but uses the blackhole-specific mtu, update_pmtu and redirect hooks
 * defined just above.
 */
279 static struct dst_ops ip6_dst_blackhole_ops = {
281 .destroy = ip6_dst_destroy,
282 .check = ip6_dst_check,
283 .mtu = ip6_blackhole_mtu,
284 .default_advmss = ip6_default_advmss,
285 .update_pmtu = ip6_rt_blackhole_update_pmtu,
286 .redirect = ip6_rt_blackhole_redirect,
287 .cow_metrics = dst_cow_metrics_generic,
288 .neigh_lookup = ip6_dst_neigh_lookup,
/* Metrics array indexed by (RTAX_* - 1). The hop-limit slot is explicitly
 * initialised to 0; all other slots are zero by default initialisation.
 */
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the FIB "null" entry: a kernel-owned reject route of type
 * RTN_UNREACHABLE with no nexthop, the largest possible metric, one
 * initial reference and the default dst metrics.
 */
295 static const struct fib6_info fib6_null_entry_template = {
296 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
297 .fib6_protocol = RTPROT_KERNEL,
298 .fib6_metric = ~(u32)0,
299 .fib6_ref = ATOMIC_INIT(1),
300 .fib6_type = RTN_UNREACHABLE,
/* Cast drops const: the shared default metrics are referenced, not copied. */
301 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
/* Template for the dst-level null route: a reject route whose error is
 * -ENETUNREACH and whose input/output handlers are the ip6_pkt_discard
 * pair. (The nested initializer braces are not visible in this view.)
 */
304 static const struct rt6_info ip6_null_entry_template = {
306 .__refcnt = ATOMIC_INIT(1),
308 .obsolete = DST_OBSOLETE_FORCE_CHK,
309 .error = -ENETUNREACH,
310 .input = ip6_pkt_discard,
311 .output = ip6_pkt_discard_out,
313 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Template for the "prohibit" route (built only with
 * CONFIG_IPV6_MULTIPLE_TABLES): a reject route handled by the
 * ip6_pkt_prohibit pair. Its .error initializer is not visible here.
 */
318 static const struct rt6_info ip6_prohibit_entry_template = {
320 .__refcnt = ATOMIC_INIT(1),
322 .obsolete = DST_OBSOLETE_FORCE_CHK,
324 .input = ip6_pkt_prohibit,
325 .output = ip6_pkt_prohibit_out,
327 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
/* Template for the "blackhole" route: a reject route whose packets go to
 * the generic dst_discard handlers. Its .error initializer is not visible
 * here.
 */
330 static const struct rt6_info ip6_blk_hole_entry_template = {
332 .__refcnt = ATOMIC_INIT(1),
334 .obsolete = DST_OBSOLETE_FORCE_CHK,
336 .input = dst_discard,
337 .output = dst_discard_out,
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
/* Reset the rt6_info-specific part of @rt, leaving the embedded dst_entry
 * untouched, and initialise the uncached-list linkage as an empty list.
 */
344 static void rt6_info_init(struct rt6_info *rt)
346 struct dst_entry *dst = &rt->dst;
/* Zero everything after the dst_entry. This arithmetic assumes dst is the
 * first member of struct rt6_info (the struct is not visible here).
 */
348 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349 INIT_LIST_HEAD(&rt->rt6i_uncached);
/* Allocate an rt6_info from @net's ip6_dst_ops, bound to @dev.
 *
 * The dst is requested from dst_alloc() with the arguments 1 and
 * DST_OBSOLETE_FORCE_CHK plus the caller's flags, and the namespace's
 * fib_rt_alloc counter is bumped. The NULL check and the return statement
 * are not visible in this view.
 */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
356 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357 1, DST_OBSOLETE_FORCE_CHK, flags);
361 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
366 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops destroy hook: release everything the rt6_info behind @dst holds.
 *
 * Visible steps: drop the metrics, take the route off its uncached list,
 * detach rt->rt6i_idev, then clear rt->from and release that fib6_info.
 * The release of idev and any locking around the rt->from access are not
 * visible in this view.
 */
368 static void ip6_dst_destroy(struct dst_entry *dst)
370 struct rt6_info *rt = (struct rt6_info *)dst;
371 struct fib6_info *from;
372 struct inet6_dev *idev;
374 ip_dst_metrics_put(dst);
375 rt6_uncached_list_del(rt);
377 idev = rt->rt6i_idev;
379 rt->rt6i_idev = NULL;
/* Clear the back-pointer before dropping the reference it represented. */
384 from = rcu_dereference(rt->from);
385 rcu_assign_pointer(rt->from, NULL);
386 fib6_info_release(from);
/* dst_ops ifdown hook: when the route's inet6_dev does not belong to the
 * namespace's loopback device, switch rt->rt6i_idev over to the loopback
 * inet6_dev (a reference is taken via in6_dev_get()). Checks on the new
 * idev and the release of the old one are not visible in this view.
 */
390 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
393 struct rt6_info *rt = (struct rt6_info *)dst;
394 struct inet6_dev *idev = rt->rt6i_idev;
395 struct net_device *loopback_dev =
396 dev_net(dev)->loopback_dev;
398 if (idev && idev->dev != loopback_dev) {
399 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
401 rt->rt6i_idev = loopback_idev;
/* For a route flagged RTF_EXPIRES, report whether jiffies is already past
 * rt->dst.expires. The result for routes without that flag is not visible
 * in this view.
 */
407 static bool __rt6_check_expired(const struct rt6_info *rt)
409 if (rt->rt6i_flags & RTF_EXPIRES)
410 return time_after(jiffies, rt->dst.expires);
/* Expiry check that also takes the originating fib6_info into account.
 *
 * A route carrying RTF_EXPIRES is judged by its own dst.expires. The other
 * visible branch treats the route as expired when the dst is no longer
 * marked DST_OBSOLETE_FORCE_CHK or when the fib6_info in rt->from has
 * expired. The conditions joining the two branches are not visible here.
 */
415 static bool rt6_check_expired(const struct rt6_info *rt)
417 struct fib6_info *from;
419 from = rcu_dereference(rt->from);
421 if (rt->rt6i_flags & RTF_EXPIRES) {
422 if (time_after(jiffies, rt->dst.expires))
425 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
426 fib6_check_expired(from);
/* Pick one route out of @match and its multipath siblings.
 *
 * The choice is driven by fl6->mp_hash, which is computed here with
 * rt6_multipath_hash() if needed. The hash is first compared with @match's
 * own fib_nh_upper_bound, then the siblings are walked: a sibling whose
 * upper bound is below the hash is passed over, and a candidate is also
 * checked with rt6_score_route() (negative score = unusable). The
 * statements executed after each test are not visible in this view.
 */
431 struct fib6_info *fib6_multipath_select(const struct net *net,
432 struct fib6_info *match,
433 struct flowi6 *fl6, int oif,
434 const struct sk_buff *skb,
437 struct fib6_info *sibling, *next_sibling;
439 /* We might have already computed the hash for ICMPv6 errors. In such
440 * case it will always be non-zero. Otherwise now is the time to do it.
443 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
445 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
448 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
450 const struct fib6_nh *nh = &sibling->fib6_nh;
453 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
454 if (fl6->mp_hash > nh_upper_bound)
456 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
466 * Route lookup. rcu_read_lock() should be held.
/* Test whether nexthop @nh is acceptable for the given output interface
 * and source address.
 *
 * Visible checks, in order: a nexthop flagged RTNH_F_DEAD is tested for
 * first; the nexthop device's ifindex is compared with @oif; and the source
 * address is checked against the device with ipv6_chk_addr(), strictly when
 * RT6_LOOKUP_F_IFACE is set in @flags. The branch structure and return
 * statements are not visible in this view.
 */
469 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
470 const struct in6_addr *saddr, int oif, int flags)
472 const struct net_device *dev;
474 if (nh->fib_nh_flags & RTNH_F_DEAD)
477 dev = nh->fib_nh_dev;
479 if (dev->ifindex == oif)
482 if (ipv6_chk_addr(net, saddr, dev,
483 flags & RT6_LOOKUP_F_IFACE))
/* Starting at @rt, walk the fib6_next chain looking for a route whose
 * nexthop passes __rt6_device_match().
 *
 * A shortcut is tested for first: no @oif, an unspecified @saddr and a
 * live first nexthop. When the walk finds nothing, a strict interface
 * lookup (oif set and RT6_LOOKUP_F_IFACE) yields the null entry; otherwise
 * @rt itself is returned unless its nexthop is dead, in which case the
 * null entry is returned.
 */
490 static inline struct fib6_info *rt6_device_match(struct net *net,
491 struct fib6_info *rt,
492 const struct in6_addr *saddr,
496 const struct fib6_nh *nh;
497 struct fib6_info *sprt;
499 if (!oif && ipv6_addr_any(saddr) &&
500 !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
503 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
505 if (__rt6_device_match(net, nh, saddr, oif, flags))
509 if (oif && flags & RT6_LOOKUP_F_IFACE)
510 return net->ipv6.fib6_null_entry;
512 return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
515 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router probe request: the work item plus the address to solicit
 * and the device to send the solicitation on.
 */
516 struct __rt6_probe_work {
517 struct work_struct work;
518 struct in6_addr target;
519 struct net_device *dev;
/* Work handler for __rt6_probe_work: derive the solicited-node multicast
 * address of the target and send a neighbour solicitation for it on the
 * recorded device. Cleanup of the work item is not visible in this view.
 */
522 static void rt6_probe_deferred(struct work_struct *w)
524 struct in6_addr mcaddr;
525 struct __rt6_probe_work *work =
526 container_of(w, struct __rt6_probe_work, work);
528 addrconf_addr_solict_mult(&work->target, &mcaddr);
529 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
/* Schedule a reachability probe (neighbour solicitation) for the gateway
 * of @fib6_nh, rate-limited by the interface's rtr_probe_interval.
 *
 * With a neighbour entry present, a probe is only prepared when the entry
 * is not in a NUD_VALID state and its last update is older than the
 * interval. Without one, fib6_nh->last_probe is used as the reference
 * time. The actual transmission is deferred to rt6_probe_deferred() via
 * schedule_work(); the allocation uses GFP_ATOMIC.
 */
534 static void rt6_probe(struct fib6_nh *fib6_nh)
536 struct __rt6_probe_work *work = NULL;
537 const struct in6_addr *nh_gw;
538 struct neighbour *neigh;
539 struct net_device *dev;
540 struct inet6_dev *idev;
543 * Okay, this does not seem to be appropriate
544 * for now, however, we need to check if it
545 * is really so; aka Router Reachability Probing.
547 * Router Reachability Probe MUST be rate-limited
548 * to no more than one per minute.
/* NOTE(review): the statement under this test is not visible here. If it
 * is an early return, the test looks inverted: the gateway address read
 * right below is only meaningful when fib_nh_gw_family is set (compare
 * rt6_score_route(), which requires it before checking the neighbour).
 * Confirm against the full file.
 */
550 if (fib6_nh->fib_nh_gw_family)
553 nh_gw = &fib6_nh->fib_nh_gw6;
554 dev = fib6_nh->fib_nh_dev;
556 idev = __in6_dev_get(dev);
557 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
/* Unlocked peek first; the state is re-checked under neigh->lock below. */
559 if (neigh->nud_state & NUD_VALID)
562 write_lock(&neigh->lock);
563 if (!(neigh->nud_state & NUD_VALID) &&
565 neigh->updated + idev->cnf.rtr_probe_interval)) {
566 work = kmalloc(sizeof(*work), GFP_ATOMIC);
568 __neigh_set_probe_once(neigh);
570 write_unlock(&neigh->lock);
571 } else if (time_after(jiffies, fib6_nh->last_probe +
572 idev->cnf.rtr_probe_interval)) {
573 work = kmalloc(sizeof(*work), GFP_ATOMIC);
/* Record the probe time and hand the request to the workqueue. */
577 fib6_nh->last_probe = jiffies;
578 INIT_WORK(&work->work, rt6_probe_deferred);
579 work->target = *nh_gw;
582 schedule_work(&work->work);
586 rcu_read_unlock_bh();
/* Second definition of rt6_probe() in this file. The preprocessor lines
 * separating it from the one above are not visible here; presumably this
 * is the variant built without CONFIG_IPV6_ROUTER_PREF.
 */
589 static inline void rt6_probe(struct fib6_nh *fib6_nh)
595 * Default Router Selection (RFC 2461 6.3.6)
/* Classify the reachability of @fib6_nh's gateway as an rt6_nud_state.
 *
 * The result starts out as RT6_NUD_FAIL_HARD. With a neighbour entry, a
 * NUD_VALID state gives RT6_NUD_SUCCEED; with CONFIG_IPV6_ROUTER_PREF any
 * state other than NUD_FAILED also succeeds, and RT6_NUD_FAIL_PROBE is the
 * remaining outcome. The last assignment (presumably the no-entry case; its
 * guard is not visible) gives SUCCEED with router preference support and
 * RT6_NUD_FAIL_DO_RR without it.
 */
597 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
599 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
600 struct neighbour *neigh;
603 neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
604 &fib6_nh->fib_nh_gw6);
/* nud_state is sampled under the neighbour's read lock. */
606 read_lock(&neigh->lock);
607 if (neigh->nud_state & NUD_VALID)
608 ret = RT6_NUD_SUCCEED;
609 #ifdef CONFIG_IPV6_ROUTER_PREF
610 else if (!(neigh->nud_state & NUD_FAILED))
611 ret = RT6_NUD_SUCCEED;
613 ret = RT6_NUD_FAIL_PROBE;
615 read_unlock(&neigh->lock);
617 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
618 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
620 rcu_read_unlock_bh();
/* Score nexthop @nh for a lookup on @oif under the @strict flags.
 *
 * Visible contributions: an interface match (no @oif requested, or the
 * nexthop device's ifindex equals @oif); with CONFIG_IPV6_ROUTER_PREF the
 * decoded router preference from @fib6_flags, shifted left by 2; and, when
 * RT6_LOOKUP_F_REACHABLE is requested for a gateway route, the result of
 * rt6_check_neigh(). A strict interface lookup that found no interface
 * match returns RT6_NUD_FAIL_HARD. The final return is not visible here.
 */
625 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
630 if (!oif || nh->fib_nh_dev->ifindex == oif)
633 if (!m && (strict & RT6_LOOKUP_F_IFACE))
634 return RT6_NUD_FAIL_HARD;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
638 if ((strict & RT6_LOOKUP_F_REACHABLE) &&
639 !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
640 int n = rt6_check_neigh(nh);
/* Decide whether nexthop @nh beats the best candidate seen so far.
 *
 * Dead nexthops are tested for first, as are link-down nexthops when the
 * device ignores link-down routes and the lookup does not carry
 * RT6_LOOKUP_F_IGNORE_LINKSTATE. The score comes from rt6_score_route():
 * RT6_NUD_FAIL_DO_RR is mapped to score 0, RT6_NUD_FAIL_HARD has its own
 * branch. *do_rr receives the locally computed round-robin request. The
 * comparison against *mpri and the return statements are not visible in
 * this view.
 */
647 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
648 int oif, int strict, int *mpri, bool *do_rr)
650 bool match_do_rr = false;
654 if (nh->fib_nh_flags & RTNH_F_DEAD)
657 if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
658 nh->fib_nh_flags & RTNH_F_LINKDOWN &&
659 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
662 m = rt6_score_route(nh, fib6_flags, oif, strict);
663 if (m == RT6_NUD_FAIL_DO_RR) {
665 m = 0; /* lowest valid score */
666 } else if (m == RT6_NUD_FAIL_HARD) {
670 if (strict & RT6_LOOKUP_F_REACHABLE)
673 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
675 *do_rr = match_do_rr;
/* Scan one stretch of a fib6_next chain, feeding each route's nexthop to
 * find_match().
 *
 * When @cont is non-NULL, a route whose metric differs from @metric is
 * tested for specially (the action taken is not visible here). Expired
 * routes are tested for with fib6_check_expired(). The loop bounds using
 * @rt_start and @nomatch and the update of *match are not visible in this
 * view.
 */
683 static void __find_rr_leaf(struct fib6_info *rt_start,
684 struct fib6_info *nomatch, u32 metric,
685 struct fib6_info **match, struct fib6_info **cont,
686 int oif, int strict, bool *do_rr, int *mpri)
688 struct fib6_info *rt;
692 rt = rcu_dereference(rt->fib6_next)) {
695 if (cont && rt->fib6_metric != metric) {
700 if (fib6_check_expired(rt))
704 if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr))
/* Find the best route for a round-robin lookup in up to three passes over
 * the leaf chain, all sharing one match/mpri state:
 *   1. from @rr_head onwards,
 *   2. from @leaf, with @rr_head passed as the "nomatch" bound,
 *   3. from the continuation recorded in "cont", this time without
 *      collecting a further continuation (NULL is passed).
 * The conditions guarding pass 3 and the return are not visible here.
 */
709 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
710 struct fib6_info *leaf,
711 struct fib6_info *rr_head,
712 u32 metric, int oif, int strict,
715 struct fib6_info *match = NULL, *cont = NULL;
718 __find_rr_leaf(rr_head, NULL, metric, &match, &cont,
719 oif, strict, do_rr, &mpri);
721 __find_rr_leaf(leaf, rr_head, metric, &match, &cont,
722 oif, strict, do_rr, &mpri);
727 __find_rr_leaf(cont, NULL, metric, &match, NULL,
728 oif, strict, do_rr, &mpri);
/* Select the route to use from node @fn.
 *
 * Returns the namespace's fib6_null_entry when the node has no usable leaf,
 * when the node's bit length does not match the key length of the
 * round-robin head, or when find_rr_leaf() finds nothing. When round-robin
 * is requested, fn->rr_ptr is advanced under the table lock.
 */
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
736 struct fib6_info *leaf = rcu_dereference(fn->leaf);
737 struct fib6_info *match, *rt0;
741 if (!leaf || leaf == net->ipv6.fib6_null_entry)
742 return net->ipv6.fib6_null_entry;
/* Current round-robin head of this node. */
744 rt0 = rcu_dereference(fn->rr_ptr);
748 /* Double check to make sure fn is not an intermediate node
749 * and fn->leaf does not point to its child's leaf
750 * (This might happen if all routes under fn are deleted from
751 * the tree and fib6_repair_tree() is called on the node.)
753 key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755 if (rt0->fib6_src.plen)
756 key_plen = rt0->fib6_src.plen;
758 if (fn->fn_bit != key_plen)
759 return net->ipv6.fib6_null_entry;
761 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
765 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
767 /* no entries matched; do round-robin */
768 if (!next || next->fib6_metric != rt0->fib6_metric)
/* rr_ptr is only written with the table lock held. */
772 spin_lock_bh(&leaf->fib6_table->tb6_lock);
773 /* make sure next is not being deleted from the tree */
775 rcu_assign_pointer(fn->rr_ptr, next);
776 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
780 return match ? match : net->ipv6.fib6_null_entry;
/* True when @rt is flagged RTF_NONEXTHOP or its nexthop has a gateway
 * (fib_nh_gw_family is set).
 */
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
785 return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_gw_family;
788 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (@opt, @len bytes) received on @dev
 * from router @gwaddr.
 *
 * Visible steps: validate the option size and the length/prefix_len
 * combination; read the route preference (ICMPV6_ROUTER_PREF_INVALID is
 * tested for); convert the lifetime; obtain the prefix (used in place when
 * the option length is 3, otherwise copied through ipv6_addr_prefix());
 * look up an existing route (the default router when prefix_len is 0);
 * add one through rt6_add_route_info() if needed; then update the route's
 * preference flags and expiry and drop the reference. Error returns and
 * several branch bodies are not visible in this view.
 */
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790 const struct in6_addr *gwaddr)
792 struct net *net = dev_net(dev);
793 struct route_info *rinfo = (struct route_info *) opt;
794 struct in6_addr prefix_buf, *prefix;
796 unsigned long lifetime;
797 struct fib6_info *rt;
799 if (len < sizeof(struct route_info)) {
803 /* Sanity check for prefix_len and length */
804 if (rinfo->length > 3) {
806 } else if (rinfo->prefix_len > 128) {
808 } else if (rinfo->prefix_len > 64) {
809 if (rinfo->length < 2) {
812 } else if (rinfo->prefix_len > 0) {
813 if (rinfo->length < 1) {
818 pref = rinfo->route_pref;
819 if (pref == ICMPV6_ROUTER_PREF_INVALID)
822 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
824 if (rinfo->length == 3)
825 prefix = (struct in6_addr *)rinfo->prefix;
827 /* this function is safe */
828 ipv6_addr_prefix(&prefix_buf,
829 (struct in6_addr *)rinfo->prefix,
831 prefix = &prefix_buf;
834 if (rinfo->prefix_len == 0)
835 rt = rt6_get_dflt_router(net, gwaddr, dev);
837 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
/* An existing route announced with a zero lifetime gets special handling
 * (the body of this branch is not visible here).
 */
840 if (rt && !lifetime) {
846 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Replace only the preference bits, and mark the route as RTF_ROUTEINFO. */
849 rt->fib6_flags = RTF_ROUTEINFO |
850 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
853 if (!addrconf_finite_timeout(lifetime))
854 fib6_clean_expires(rt);
856 fib6_set_expires(rt, jiffies + HZ * lifetime);
858 fib6_info_release(rt);
865 * Misc support functions
/* Choose the net_device a dst created from @rt should be bound to.
 * Called with rcu_lock held.
 *
 * The starting point is the nexthop device. For RTF_LOCAL/RTF_ANYCAST
 * routes it may be replaced by the L3 master device or by the namespace's
 * loopback device, as the in-body comments explain.
 */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
871 struct net_device *dev = rt->fib6_nh.fib_nh_dev;
873 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874 /* for copies of local routes, dst->dev needs to be the
875 * device if it is a master device, the master device if
876 * device is enslaved, and the loopback as the default
878 if (netif_is_l3_slave(dev) &&
879 !rt6_need_strict(&rt->fib6_dst.addr))
880 dev = l3mdev_master_dev_rcu(dev);
881 else if (!netif_is_l3_master(dev))
882 dev = dev_net(dev)->loopback_dev;
883 /* last case is netif_is_l3_master(dev) is true in which
884 * case we want dev returned to be dev
/* Map from route type (RTN_*) to the errno stored in dst.error, consumed
 * by ip6_rt_type_to_error(). Only the non-zero entries are visible in this
 * view.
 */
891 static const int fib6_prop[RTN_MAX + 1] = {
898 [RTN_BLACKHOLE] = -EINVAL,
899 [RTN_UNREACHABLE] = -EHOSTUNREACH,
900 [RTN_PROHIBIT] = -EACCES,
901 [RTN_THROW] = -EAGAIN,
903 [RTN_XRESOLVE] = -EINVAL,
/* Translate a route type into its errno via the fib6_prop table.
 * @fib6_type is used as the index without a range check, so it must not
 * exceed RTN_MAX.
 */
906 static int ip6_rt_type_to_error(u8 fib6_type)
908 return fib6_prop[fib6_type];
/* Derive the DST_* allocation flags for a dst created from @rt.
 * DST_NOPOLICY mirrors rt->dst_nopolicy; the condition guarding
 * DST_NOCOUNT and the return are not visible in this view.
 */
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
913 unsigned short flags = 0;
916 flags |= DST_NOCOUNT;
917 if (rt->dst_nopolicy)
918 flags |= DST_NOPOLICY;
/* Set up the error code and packet handlers of @rt for a reject route.
 *
 * dst.error comes from ip6_rt_type_to_error(). The handler pairs are
 * chosen by route type: the generic dst_discard pair, the ip6_pkt_prohibit
 * pair, or the ip6_pkt_discard pair. Only the RTN_UNREACHABLE case label
 * is visible in this view.
 */
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
927 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
929 switch (ort->fib6_type) {
931 rt->dst.output = dst_discard_out;
932 rt->dst.input = dst_discard;
935 rt->dst.output = ip6_pkt_prohibit_out;
936 rt->dst.input = ip6_pkt_prohibit;
939 case RTN_UNREACHABLE:
941 rt->dst.output = ip6_pkt_discard_out;
942 rt->dst.input = ip6_pkt_discard;
/* Initialise the dst part of @rt from the FIB entry @ort.
 *
 * Reject routes are delegated to ip6_rt_init_dst_reject(). Otherwise the
 * output handler is ip6_output and the input handler depends on the route:
 * ip6_input for local/anycast types, ip6_mc_input for multicast
 * destinations, ip6_forward for the rest. A lightweight-tunnel state on
 * the nexthop is referenced and hooked in, and lastuse is stamped.
 */
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
949 if (ort->fib6_flags & RTF_REJECT) {
950 ip6_rt_init_dst_reject(rt, ort);
955 rt->dst.output = ip6_output;
957 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
958 rt->dst.input = ip6_input;
959 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
960 rt->dst.input = ip6_mc_input;
962 rt->dst.input = ip6_forward;
965 if (ort->fib6_nh.fib_nh_lws) {
966 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
967 lwtunnel_set_redirect(&rt->dst);
970 rt->dst.lastuse = jiffies;
/* Attach @rt to the FIB entry @from it is derived from: clear RTF_EXPIRES
 * on @rt, publish @from in rt->from and initialise the dst metrics from
 * @from's metrics.
 *
 * Caller must already hold reference to @from.
 */
974 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
976 rt->rt6i_flags &= ~RTF_EXPIRES;
977 rcu_assign_pointer(rt->from, from);
978 ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
/* Fill a freshly allocated @rt from the FIB entry @ort: dst handlers,
 * destination key, inet6_dev (a reference is taken when the nexthop has a
 * device), flags, gateway (plus RTF_GATEWAY when the nexthop has one), the
 * back-pointer to @ort, and with CONFIG_IPV6_SUBTREES the source key.
 *
 * Caller must already hold reference to @ort.
 */
982 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
984 struct net_device *dev = fib6_info_nh_dev(ort);
986 ip6_rt_init_dst(rt, ort);
988 rt->rt6i_dst = ort->fib6_dst;
989 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
990 rt->rt6i_flags = ort->fib6_flags;
991 if (ort->fib6_nh.fib_nh_gw_family) {
992 rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
993 rt->rt6i_flags |= RTF_GATEWAY;
995 rt6_set_from(rt, ort);
996 #ifdef CONFIG_IPV6_SUBTREES
997 rt->rt6i_src = ort->fib6_src;
/* Step from @fn towards the root of the FIB tree to find the next node
 * worth examining.
 *
 * Visible steps: a top-level root node (RTN_TL_ROOT) is tested for; the
 * parent is fetched; the parent's subtree, if any, is searched with @saddr;
 * and a node carrying route information (RTN_RTINFO) is tested for. The
 * loop structure and return statements are not visible in this view.
 */
1001 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1002 struct in6_addr *saddr)
1004 struct fib6_node *pn, *sn;
1006 if (fn->fn_flags & RTN_TL_ROOT)
1008 pn = rcu_dereference(fn->parent);
1009 sn = FIB6_SUBTREE(pn);
1011 fn = fib6_node_lookup(sn, NULL, saddr);
1014 if (fn->fn_flags & RTN_RTINFO)
/* Try to take a reference on *@prt with dst_hold_safe().
 *
 * The visible fallback path substitutes @net's ip6_null_entry. How @net ==
 * NULL is handled (rt6_get_pcpu_route() passes NULL), the write-back
 * through @prt and the return values are not visible in this view.
 */
1019 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1021 struct rt6_info *rt = *prt;
1023 if (dst_hold_safe(&rt->dst))
1026 rt = net->ipv6.ip6_null_entry;
/* Build a new rt6_info for the FIB entry @rt; called with rcu_lock held.
 *
 * A reference on @rt is taken first with fib6_info_hold_safe(). The dst is
 * allocated in the namespace of the nexthop device and initialised with
 * ip6_rt_copy_init(). The visible fallback is the namespace's
 * ip6_null_entry with a reference held, and a failure path releases the
 * reference on @rt. The branch conditions are not visible in this view.
 */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1038 unsigned short flags = fib6_info_dst_flags(rt);
1039 struct net_device *dev = rt->fib6_nh.fib_nh_dev;
1040 struct rt6_info *nrt;
1042 if (!fib6_info_hold_safe(rt))
1045 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1047 fib6_info_release(rt);
1051 ip6_rt_copy_init(nrt, rt);
1055 nrt = dev_net(dev)->ipv6.ip6_null_entry;
1056 dst_hold(&nrt->dst);
/* Policy-routing lookup callback (passed to fib6_rule_lookup() below):
 * resolve @fl6 in @table to an rt6_info.
 *
 * Visible flow: the strict-interface flag is dropped when the flow asks to
 * skip the nexthop oif; the tree is searched by destination and source;
 * the leaf is filtered through rt6_device_match(); a null-entry result
 * triggers fib6_backtrack(); multipath selection runs when the entry has
 * siblings and no oif was given; then a cached exception route is
 * preferred, and otherwise a new dst is created with ip6_create_rt_rcu().
 */
1060 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1061 struct fib6_table *table,
1063 const struct sk_buff *skb,
1066 struct fib6_info *f6i;
1067 struct fib6_node *fn;
1068 struct rt6_info *rt;
1070 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1071 flags &= ~RT6_LOOKUP_F_IFACE;
1074 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1076 f6i = rcu_dereference(fn->leaf);
1078 f6i = net->ipv6.fib6_null_entry;
1080 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1081 fl6->flowi6_oif, flags);
1083 if (f6i == net->ipv6.fib6_null_entry) {
1084 fn = fib6_backtrack(fn, &fl6->saddr);
1088 rt = net->ipv6.ip6_null_entry;
1093 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1094 f6i = fib6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb,
1096 /* Search through exception table */
1097 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1099 if (ip6_hold_safe(net, &rt))
1100 dst_use_noref(&rt->dst, jiffies);
1102 rt = ip6_create_rt_rcu(f6i);
1106 trace_fib6_table_lookup(net, f6i, table, fl6);
/* Exported entry point: run @fl6 through the FIB rules, using
 * ip6_pol_route_lookup() as the per-table lookup.
 */
1113 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1114 const struct sk_buff *skb, int flags)
1116 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1118 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by address: build a flowi6 from the arguments and
 * resolve it through the FIB rules.
 *
 * A non-zero @strict requests RT6_LOOKUP_F_IFACE. When a source address is
 * supplied it is copied into the flow and RT6_LOOKUP_F_HAS_SADDR is set
 * (the guarding test is not visible here). The dst is returned, cast to
 * rt6_info, only when its error field is 0; the failure path is not
 * visible in this view.
 */
1120 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1121 const struct in6_addr *saddr, int oif,
1122 const struct sk_buff *skb, int strict)
1124 struct flowi6 fl6 = {
1128 struct dst_entry *dst;
1129 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1132 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1133 flags |= RT6_LOOKUP_F_HAS_SADDR;
1136 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1137 if (dst->error == 0)
1138 return (struct rt6_info *) dst;
1144 EXPORT_SYMBOL(rt6_lookup);
1146 /* ip6_ins_rt is called with table->tb6_lock NOT held.
1147 * It takes a new route entry; if the addition fails for any reason,
1148 * the route is released.
1149 * Caller must hold dst before calling it.
/* Insert @rt into its own table (rt->fib6_table) with fib6_add(), taking
 * the table's tb6_lock with BHs disabled around the call. The return of
 * the fib6_add() result is not visible in this view.
 */
1152 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1153 struct netlink_ext_ack *extack)
1156 struct fib6_table *table;
1158 table = rt->fib6_table;
1159 spin_lock_bh(&table->tb6_lock);
1160 err = fib6_add(&table->tb6_root, rt, info, extack);
1161 spin_unlock_bh(&table->tb6_lock);
/* Insert @rt on behalf of the kernel: wraps __ip6_ins_rt() with an nl_info
 * that only names @net and with no extended-ack buffer.
 */
1166 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1168 struct nl_info info = { .nl_net = net, };
1170 return __ip6_ins_rt(rt, &info, NULL);
/* Allocate a host (/128) cache route for @daddr derived from @ort.
 *
 * A reference on @ort is taken first. The dst is bound to the device
 * chosen by ip6_rt_get_dev_rcu(), initialised from @ort and marked
 * RTF_CACHE / DST_HOST. For routes that are neither gateway nor
 * no-nexthop, a destination equal to the (non-/128) prefix address is
 * flagged RTF_ANYCAST, and with CONFIG_IPV6_SUBTREES the source key is
 * narrowed to @saddr/128. Failure paths and the return are not fully
 * visible in this view.
 */
1173 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1174 const struct in6_addr *daddr,
1175 const struct in6_addr *saddr)
1177 struct net_device *dev;
1178 struct rt6_info *rt;
1184 if (!fib6_info_hold_safe(ort))
1187 dev = ip6_rt_get_dev_rcu(ort);
1188 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1190 fib6_info_release(ort);
1194 ip6_rt_copy_init(rt, ort);
1195 rt->rt6i_flags |= RTF_CACHE;
1196 rt->dst.flags |= DST_HOST;
1197 rt->rt6i_dst.addr = *daddr;
1198 rt->rt6i_dst.plen = 128;
1200 if (!rt6_is_gw_or_nonexthop(ort)) {
1201 if (ort->fib6_dst.plen != 128 &&
1202 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1203 rt->rt6i_flags |= RTF_ANYCAST;
1204 #ifdef CONFIG_IPV6_SUBTREES
1205 if (rt->rt6i_src.plen && saddr) {
1206 rt->rt6i_src.addr = *saddr;
1207 rt->rt6i_src.plen = 128;
/* Allocate a per-CPU copy of FIB entry @rt.
 *
 * Takes a reference on @rt, allocates a dst on the device chosen by
 * ip6_rt_get_dev_rcu() with the flags from fib6_info_dst_flags(),
 * initialises it from @rt and marks it RTF_PCPU. A failure path drops the
 * reference on @rt again. Branch conditions and returns are not visible in
 * this view.
 */
1215 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1217 unsigned short flags = fib6_info_dst_flags(rt);
1218 struct net_device *dev;
1219 struct rt6_info *pcpu_rt;
1221 if (!fib6_info_hold_safe(rt))
1225 dev = ip6_rt_get_dev_rcu(rt);
1226 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1229 fib6_info_release(rt);
1232 ip6_rt_copy_init(pcpu_rt, rt);
1233 pcpu_rt->rt6i_flags |= RTF_PCPU;
/* Fetch this CPU's cached copy of @rt from rt->rt6i_pcpu and try to take a
 * reference on it with ip6_hold_safe(); NULL is passed as the namespace.
 * The read of the slot and the return are not visible in this view.
 *
 * It should be called with rcu_read_lock() acquired.
 */
1238 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1240 struct rt6_info *pcpu_rt, **p;
1242 p = this_cpu_ptr(rt->rt6i_pcpu);
1246 ip6_hold_safe(NULL, &pcpu_rt);
/* Create this CPU's copy of @rt and install it in rt->rt6i_pcpu.
 *
 * If the allocation fails (the test itself is not visible here), @net's
 * ip6_null_entry is returned with a reference held. Otherwise an extra
 * reference is taken on the new route and it is installed with cmpxchg(),
 * which only succeeds while the slot is still NULL. Handling of "prev" and
 * the final return are not visible in this view.
 */
1251 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1252 struct fib6_info *rt)
1254 struct rt6_info *pcpu_rt, *prev, **p;
1256 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1258 dst_hold(&net->ipv6.ip6_null_entry->dst);
1259 return net->ipv6.ip6_null_entry;
1262 dst_hold(&pcpu_rt->dst);
1263 p = this_cpu_ptr(rt->rt6i_pcpu);
1264 prev = cmpxchg(p, NULL, pcpu_rt);
1270 /* exception hash table implementation
1272 static DEFINE_SPINLOCK(rt6_exception_lock);
1274 /* Remove rt6_ex from hash table and free the memory
1275 * Caller must hold rt6_exception_lock
1277 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1278 struct rt6_exception *rt6_ex)
1280 struct fib6_info *from;
/* Both arguments are required; the statement under this test is not
 * visible here (presumably an early return).
 */
1283 if (!bucket || !rt6_ex)
/* Plain (non-atomic) decrement of the per-namespace cache counter. */
1286 net = dev_net(rt6_ex->rt6i->dst.dev);
1287 net->ipv6.rt6_stats->fib_rt_cache--;
1289 /* purge completely the exception to allow releasing the held resources:
1290 * some [sk] cache may keep the dst around for unlimited time
1292 from = rcu_dereference_protected(rt6_ex->rt6i->from,
1293 lockdep_is_held(&rt6_exception_lock));
1294 rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
1295 fib6_info_release(from);
1296 dst_dev_put(&rt6_ex->rt6i->dst);
1298 hlist_del_rcu(&rt6_ex->hlist);
1299 dst_release(&rt6_ex->rt6i->dst);
1300 kfree_rcu(rt6_ex, rcu);
1301 WARN_ON_ONCE(!bucket->depth);
1305 /* Remove oldest rt6_ex in bucket and free the memory
1306 * Caller must hold rt6_exception_lock
1308 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1310 struct rt6_exception *rt6_ex, *oldest = NULL;
/* Linear scan of the chain for the entry with the earliest stamp; the
 * assignment to "oldest" under the test is not visible in this view.
 */
1315 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1316 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1319 rt6_remove_exception(bucket, oldest);
/* Hash a (destination, source) address pair to an exception bucket index.
 *
 * The seed is drawn once at first use. @dst is always hashed; with
 * CONFIG_IPV6_SUBTREES, @src is folded in as well (the guarding test is
 * not visible here). The value is reduced with hash_32() to
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits.
 */
1322 static u32 rt6_exception_hash(const struct in6_addr *dst,
1323 const struct in6_addr *src)
1325 static u32 seed __read_mostly;
1328 net_get_random_once(&seed, sizeof(seed));
1329 val = jhash(dst, sizeof(*dst), seed);
1331 #ifdef CONFIG_IPV6_SUBTREES
1333 val = jhash(src, sizeof(*src), val);
1335 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1338 /* Helper function to find the cached rt in the hash table
1339 * and update bucket pointer to point to the bucket for this
1340 * (daddr, saddr) pair
1341 * Caller must hold rt6_exception_lock
1343 static struct rt6_exception *
1344 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1345 const struct in6_addr *daddr,
1346 const struct in6_addr *saddr)
1348 struct rt6_exception *rt6_ex;
/* A missing bucket array or destination is tested for first; the
 * statement under the test is not visible in this view.
 */
1351 if (!(*bucket) || !daddr)
1354 hval = rt6_exception_hash(daddr, saddr);
/* Plain (non-RCU) chain walk. An entry matches on the destination address
 * and, with CONFIG_IPV6_SUBTREES and a non-NULL saddr, on the source
 * address too.
 */
1357 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1358 struct rt6_info *rt6 = rt6_ex->rt6i;
1359 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1361 #ifdef CONFIG_IPV6_SUBTREES
1362 if (matched && saddr)
1363 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1371 /* Helper function to find the cached rt in the hash table
1372 * and update bucket pointer to point to the bucket for this
1373 * (daddr, saddr) pair
1374 * Caller must hold rcu_read_lock()
1376 static struct rt6_exception *
1377 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1378 const struct in6_addr *daddr,
1379 const struct in6_addr *saddr)
1381 struct rt6_exception *rt6_ex;
/* Warn once if the caller is not inside an RCU read-side section. */
1384 WARN_ON_ONCE(!rcu_read_lock_held());
1386 if (!(*bucket) || !daddr)
1389 hval = rt6_exception_hash(daddr, saddr);
/* Same matching rule as __rt6_find_exception_spinlock(), but the chain is
 * walked with the RCU list iterator.
 */
1392 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1393 struct rt6_info *rt6 = rt6_ex->rt6i;
1394 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1396 #ifdef CONFIG_IPV6_SUBTREES
1397 if (matched && saddr)
1398 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
/* Effective MTU of FIB entry @rt.
 *
 * Uses rt->fib6_pmtu when it is set, otherwise the mtu6 setting of the
 * nexthop device's inet6_dev. The value is capped at IP6_MAX_MTU and the
 * headroom needed by the nexthop's lightweight-tunnel state is subtracted.
 */
1406 static unsigned int fib6_mtu(const struct fib6_info *rt)
1410 if (rt->fib6_pmtu) {
1411 mtu = rt->fib6_pmtu;
1413 struct net_device *dev = fib6_info_nh_dev(rt);
1414 struct inet6_dev *idev;
1417 idev = __in6_dev_get(dev);
1418 mtu = idev->cnf.mtu6;
1422 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1424 return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
/* Insert the cached route @nrt into the exception table of FIB entry @ort.
 *
 * Everything up to the unlock runs under rt6_exception_lock with BHs
 * disabled. Visible steps: test whether the table was already flushed;
 * allocate the bucket array on first use; test the new route's MTU against
 * fib6_mtu(ort); remove an existing exception for the same key; allocate
 * and link the new entry and bump the cache counter; evict the oldest
 * entry when the bucket depth exceeds FIB6_MAX_DEPTH. After the unlock,
 * the entry's serial number is updated under the table lock and a GC run
 * is forced. Error codes and branch exits are not visible in this view.
 */
1427 static int rt6_insert_exception(struct rt6_info *nrt,
1428 struct fib6_info *ort)
1430 struct net *net = dev_net(nrt->dst.dev);
1431 struct rt6_exception_bucket *bucket;
1432 struct in6_addr *src_key = NULL;
1433 struct rt6_exception *rt6_ex;
1436 spin_lock_bh(&rt6_exception_lock);
1438 if (ort->exception_bucket_flushed) {
1443 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1444 lockdep_is_held(&rt6_exception_lock));
1446 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1452 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1455 #ifdef CONFIG_IPV6_SUBTREES
1456 /* rt6i_src.plen != 0 indicates ort is in subtree
1457 * and exception table is indexed by a hash of
1458 * both rt6i_dst and rt6i_src.
1459 * Otherwise, the exception table is indexed by
1460 * a hash of only rt6i_dst.
1462 if (ort->fib6_src.plen)
1463 src_key = &nrt->rt6i_src.addr;
1465 /* rt6_mtu_change() might lower mtu on ort.
1466 * Only insert this exception route if its mtu
1467 * is less than ort's mtu value.
1469 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1474 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1477 rt6_remove_exception(bucket, rt6_ex);
1479 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1485 rt6_ex->stamp = jiffies;
1486 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1488 net->ipv6.rt6_stats->fib_rt_cache++;
1490 if (bucket->depth > FIB6_MAX_DEPTH)
1491 rt6_exception_remove_oldest(bucket);
1494 spin_unlock_bh(&rt6_exception_lock);
1496 /* Update fn->fn_sernum to invalidate all cached dst */
1498 spin_lock_bh(&ort->fib6_table->tb6_lock);
1499 fib6_update_sernum(net, ort);
1500 spin_unlock_bh(&ort->fib6_table->tb6_lock);
/* Kick the garbage collector now that a new cached route exists. */
1501 fib6_force_start_gc(net);
/* rt6_flush_exceptions - drop every exception attached to @rt.
 * Takes rt6_exception_lock, sets rt->exception_bucket_flushed, then walks
 * all FIB6_EXCEPTION_BUCKET_SIZE buckets with the _safe iterator (entries
 * are removed while iterating) and calls rt6_remove_exception() on each
 * entry. After each bucket is drained it warns once if bucket->depth is
 * still non-zero.
 */
1507 void rt6_flush_exceptions(struct fib6_info *rt)
1509 struct rt6_exception_bucket *bucket;
1510 struct rt6_exception *rt6_ex;
1511 struct hlist_node *tmp;
1514 spin_lock_bh(&rt6_exception_lock);
1515 /* Prevent rt6_insert_exception() to recreate the bucket list */
1516 rt->exception_bucket_flushed = 1;
1518 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1519 lockdep_is_held(&rt6_exception_lock));
1523 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1524 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1525 rt6_remove_exception(bucket, rt6_ex);
1526 WARN_ON_ONCE(bucket->depth);
1531 spin_unlock_bh(&rt6_exception_lock);
/* rt6_find_cached_rt - look up the exception (cached) route of @rt for
 * the (daddr, saddr) pair.
 * Reads rt->rt6i_exception_bucket with rcu_dereference(), tests
 * rt->fib6_src.plen under CONFIG_IPV6_SUBTREES (presumably to select saddr
 * as source key; the assignment is not visible here), and searches with
 * __rt6_find_exception_rcu(). A hit is only considered when
 * rt6_check_expired() reports the cached route as not expired; res starts
 * out as NULL.
 */
1534 /* Find cached rt in the hash table inside passed in rt
1535 * Caller has to hold rcu_read_lock()
1537 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1538 struct in6_addr *daddr,
1539 struct in6_addr *saddr)
1541 struct rt6_exception_bucket *bucket;
1542 struct in6_addr *src_key = NULL;
1543 struct rt6_exception *rt6_ex;
1544 struct rt6_info *res = NULL;
1546 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1548 #ifdef CONFIG_IPV6_SUBTREES
1549 /* rt6i_src.plen != 0 indicates rt is in subtree
1550 * and exception table is indexed by a hash of
1551 * both rt6i_dst and rt6i_src.
1552 * Otherwise, the exception table is indexed by
1553 * a hash of only rt6i_dst.
1555 if (rt->fib6_src.plen)
1558 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1560 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
/* rt6_remove_exception_rt - unlink cached route @rt from its parent's
 * exception table.
 * The parent is rt->from (read with rcu_dereference()). The visible checks
 * test that rt carries RTF_CACHE and that the parent has an exception
 * bucket before any lock is taken. Under rt6_exception_lock the entry is
 * located with __rt6_find_exception_spinlock(), using rt's source address
 * as extra key when from->fib6_src.plen is non-zero (CONFIG_IPV6_SUBTREES),
 * and removed with rt6_remove_exception().
 * Returns an int; the concrete values are set on lines not shown here.
 */
1566 /* Remove the passed in cached rt from the hash table that contains it */
1567 static int rt6_remove_exception_rt(struct rt6_info *rt)
1569 struct rt6_exception_bucket *bucket;
1570 struct in6_addr *src_key = NULL;
1571 struct rt6_exception *rt6_ex;
1572 struct fib6_info *from;
1575 from = rcu_dereference(rt->from);
1577 !(rt->rt6i_flags & RTF_CACHE))
1580 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1583 spin_lock_bh(&rt6_exception_lock);
1584 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1585 lockdep_is_held(&rt6_exception_lock));
1586 #ifdef CONFIG_IPV6_SUBTREES
1587 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1588 * and exception table is indexed by a hash of
1589 * both rt6i_dst and rt6i_src.
1590 * Otherwise, the exception table is indexed by
1591 * a hash of only rt6i_dst.
1593 if (from->fib6_src.plen)
1594 src_key = &rt->rt6i_src.addr;
1596 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1600 rt6_remove_exception(bucket, rt6_ex);
1606 spin_unlock_bh(&rt6_exception_lock);
/* rt6_update_exception_stamp_rt - refresh the timestamp of the exception
 * entry that holds cached route @rt.
 * Uses only RCU accessors (rcu_dereference on rt->from and on the parent's
 * bucket pointer, then __rt6_find_exception_rcu()); no spin lock is taken
 * in the visible lines. Does nothing useful unless rt->from is set and rt
 * has RTF_CACHE. On a hit, rt6_ex->stamp is set to jiffies.
 */
1610 /* Find rt6_ex which contains the passed in rt cache and
1613 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1615 struct rt6_exception_bucket *bucket;
1616 struct in6_addr *src_key = NULL;
1617 struct rt6_exception *rt6_ex;
1618 struct fib6_info *from;
1621 from = rcu_dereference(rt->from);
1622 if (!from || !(rt->rt6i_flags & RTF_CACHE))
1625 bucket = rcu_dereference(from->rt6i_exception_bucket);
1627 #ifdef CONFIG_IPV6_SUBTREES
1628 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1629 * and exception table is indexed by a hash of
1630 * both rt6i_dst and rt6i_src.
1631 * Otherwise, the exception table is indexed by
1632 * a hash of only rt6i_dst.
1634 if (from->fib6_src.plen)
1635 src_key = &rt->rt6i_src.addr;
1637 rt6_ex = __rt6_find_exception_rcu(&bucket,
1641 rt6_ex->stamp = jiffies;
/* rt6_mtu_change_route_allowed - may the PMTU of cached route @rt be set
 * to @mtu after the device MTU changed?
 * Two conditions are evaluated: dst_mtu(&rt->dst) >= mtu (the new MTU is
 * not larger than the route PMTU) and dst_mtu(&rt->dst) == idev->cnf.mtu6
 * (the route PMTU equals the local MTU). The body comment explains the
 * intent; the returned values themselves sit on lines not shown here.
 */
1647 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1648 struct rt6_info *rt, int mtu)
1650 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1651 * lowest MTU in the path: always allow updating the route PMTU to
1652 * reflect PMTU decreases.
1654 * If the new MTU is higher, and the route PMTU is equal to the local
1655 * MTU, this means the old MTU is the lowest in the path, so allow
1656 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1660 if (dst_mtu(&rt->dst) >= mtu)
1663 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
/* rt6_exceptions_update_pmtu - propagate a new MTU to the exceptions of @rt.
 * The bucket pointer is read with rcu_dereference_protected() and a
 * lockdep_is_held(&rt6_exception_lock) annotation, and the function itself
 * does not take that lock in the visible lines, so the caller is expected
 * to hold it. For every entry in every bucket whose raw RTAX_MTU metric is
 * non-zero and for which rt6_mtu_change_route_allowed() agrees, RTAX_MTU is
 * set to @mtu.
 */
1669 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1670 struct fib6_info *rt, int mtu)
1672 struct rt6_exception_bucket *bucket;
1673 struct rt6_exception *rt6_ex;
1676 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1677 lockdep_is_held(&rt6_exception_lock));
1682 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1683 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1684 struct rt6_info *entry = rt6_ex->rt6i;
1686 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1687 * route), the metrics of its rt->from have already
1690 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1691 rt6_mtu_change_route_allowed(idev, entry, mtu))
1692 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
/* RTF_CACHE_GATEWAY is the flag pair (RTF_GATEWAY | RTF_CACHE) that an
 * exception must carry in full to be considered below.
 *
 * rt6_exceptions_clean_tohost - remove exceptions of @rt that route via
 * @gateway.
 * Tests for an exception bucket before locking, then under
 * rt6_exception_lock walks all buckets with the _safe iterator and calls
 * rt6_remove_exception() on every entry that has both RTF_CACHE_GATEWAY
 * flags set and whose rt6i_gateway equals @gateway.
 */
1698 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1700 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1701 struct in6_addr *gateway)
1703 struct rt6_exception_bucket *bucket;
1704 struct rt6_exception *rt6_ex;
1705 struct hlist_node *tmp;
1708 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1711 spin_lock_bh(&rt6_exception_lock);
1712 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1713 lockdep_is_held(&rt6_exception_lock));
1716 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1717 hlist_for_each_entry_safe(rt6_ex, tmp,
1718 &bucket->chain, hlist) {
1719 struct rt6_info *entry = rt6_ex->rt6i;
1721 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1722 RTF_CACHE_GATEWAY &&
1723 ipv6_addr_equal(gateway,
1724 &entry->rt6i_gateway)) {
1725 rt6_remove_exception(bucket, rt6_ex);
1732 spin_unlock_bh(&rt6_exception_lock);
/* rt6_age_examine_exception - garbage-collection check for one exception.
 * Removal via rt6_remove_exception() is visible in three situations:
 *  - the route has no RTF_EXPIRES and its dst.lastuse plus gc_args->timeout
 *    is not after 'now' (aged out),
 *  - the route has RTF_EXPIRES and jiffies is past dst.expires,
 *  - the route has RTF_GATEWAY and the neighbour entry looked up for
 *    rt6i_gateway on dst.dev does not carry NTF_ROUTER (neigh_flags stays 0
 *    unless it is assigned from a neighbour entry).
 * 'now' is used below but its declaration is on a line not shown here.
 */
1735 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1736 struct rt6_exception *rt6_ex,
1737 struct fib6_gc_args *gc_args,
1740 struct rt6_info *rt = rt6_ex->rt6i;
1742 /* we are pruning and obsoleting aged-out and non gateway exceptions
1743 * even if others have still references to them, so that on next
1744 * dst_check() such references can be dropped.
1745 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1746 * expired, independently from their aging, as per RFC 8201 section 4
1748 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1749 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1750 RT6_TRACE("aging clone %p\n", rt);
1751 rt6_remove_exception(bucket, rt6_ex);
1754 } else if (time_after(jiffies, rt->dst.expires)) {
1755 RT6_TRACE("purging expired route %p\n", rt);
1756 rt6_remove_exception(bucket, rt6_ex);
1760 if (rt->rt6i_flags & RTF_GATEWAY) {
1761 struct neighbour *neigh;
1762 __u8 neigh_flags = 0;
1764 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1766 neigh_flags = neigh->flags;
1768 if (!(neigh_flags & NTF_ROUTER)) {
1769 RT6_TRACE("purging route %p via non-router but gateway\n",
1771 rt6_remove_exception(bucket, rt6_ex);
/* rt6_age_exceptions - run the aging check over all exceptions of @rt.
 * Tests for an exception bucket first, then takes rt6_exception_lock with
 * plain spin_lock() and hands every entry of every bucket to
 * rt6_age_examine_exception(); the _safe iterator is used because entries
 * may be removed during the walk. The function ends with
 * rcu_read_unlock_bh(); the matching lock call is presumably on a line not
 * shown here - TODO confirm.
 */
1779 void rt6_age_exceptions(struct fib6_info *rt,
1780 struct fib6_gc_args *gc_args,
1783 struct rt6_exception_bucket *bucket;
1784 struct rt6_exception *rt6_ex;
1785 struct hlist_node *tmp;
1788 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1792 spin_lock(&rt6_exception_lock);
1793 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1794 lockdep_is_held(&rt6_exception_lock));
1797 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1798 hlist_for_each_entry_safe(rt6_ex, tmp,
1799 &bucket->chain, hlist) {
1800 rt6_age_examine_exception(bucket, rt6_ex,
1806 spin_unlock(&rt6_exception_lock);
1807 rcu_read_unlock_bh();
/* fib6_table_lookup - select a fib6_info from @table for flow @fl6.
 * Starts at the node returned by fib6_node_lookup() for (daddr, saddr),
 * tests FLOWI_FLAG_SKIP_NH_OIF in the flow flags, and calls rt6_select().
 * When the selection is net->ipv6.fib6_null_entry it backtracks with
 * fib6_backtrack() and retries; as a further fallback it clears
 * RT6_LOOKUP_F_REACHABLE from @strict and retries so that unreachable
 * routes are considered too. The outcome is reported through
 * trace_fib6_table_lookup().
 */
1810 /* must be called with rcu lock held */
1811 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1812 int oif, struct flowi6 *fl6, int strict)
1814 struct fib6_node *fn, *saved_fn;
1815 struct fib6_info *f6i;
1817 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1820 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1824 f6i = rt6_select(net, fn, oif, strict);
1825 if (f6i == net->ipv6.fib6_null_entry) {
1826 fn = fib6_backtrack(fn, &fl6->saddr);
1828 goto redo_rt6_select;
1829 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1830 /* also consider unreachable route */
1831 strict &= ~RT6_LOOKUP_F_REACHABLE;
1833 goto redo_rt6_select;
1837 trace_fib6_table_lookup(net, f6i, table, fl6);
/* ip6_pol_route - policy-routing lookup that returns an rt6_info (dst).
 * Builds @strict from RT6_LOOKUP_F_IFACE and RT6_LOOKUP_F_IGNORE_LINKSTATE
 * in @flags and adds RT6_LOOKUP_F_REACHABLE when forwarding is disabled in
 * devconf_all, then calls fib6_table_lookup(). Visible outcomes:
 *  - lookup yields fib6_null_entry: rt becomes net->ipv6.ip6_null_entry;
 *  - a multipath route (fib6_nsiblings != 0) is narrowed with
 *    fib6_multipath_select();
 *  - an exception found by rt6_find_cached_rt() is held with
 *    ip6_hold_safe() and marked used;
 *  - with FLOWI_FLAG_KNOWN_NH and a nexthop without gateway family, an
 *    RTF_CACHE clone is allocated and put on the uncached list (falling
 *    back to a held ip6_null_entry);
 *  - otherwise a per-cpu copy is fetched with rt6_get_pcpu_route() or
 *    created with rt6_make_pcpu_route().
 * Locking around these steps is on lines not shown here.
 */
1842 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1843 int oif, struct flowi6 *fl6,
1844 const struct sk_buff *skb, int flags)
1846 struct fib6_info *f6i;
1847 struct rt6_info *rt;
1850 strict |= flags & RT6_LOOKUP_F_IFACE;
1851 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1852 if (net->ipv6.devconf_all->forwarding == 0)
1853 strict |= RT6_LOOKUP_F_REACHABLE;
1857 f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1858 if (f6i == net->ipv6.fib6_null_entry) {
1859 rt = net->ipv6.ip6_null_entry;
1865 if (f6i->fib6_nsiblings)
1866 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1868 /*Search through exception table */
1869 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1871 if (ip6_hold_safe(net, &rt))
1872 dst_use_noref(&rt->dst, jiffies);
1876 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1877 !f6i->fib6_nh.fib_nh_gw_family)) {
1878 /* Create a RTF_CACHE clone which will not be
1879 * owned by the fib6 tree. It is for the special case where
1880 * the daddr in the skb during the neighbor look-up is different
1881 * from the fl6->daddr used to look-up route here.
1883 struct rt6_info *uncached_rt;
1885 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1890 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1891 * No need for another dst_hold()
1893 rt6_uncached_list_add(uncached_rt);
1894 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1896 uncached_rt = net->ipv6.ip6_null_entry;
1897 dst_hold(&uncached_rt->dst);
1902 /* Get a percpu copy */
1904 struct rt6_info *pcpu_rt;
1907 pcpu_rt = rt6_get_pcpu_route(f6i);
1910 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1918 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* ip6_pol_route_input - input-path lookup callback.
 * Thin wrapper: calls ip6_pol_route() with the flow's incoming interface
 * index (fl6->flowi6_iif) in the 'oif' argument position.
 */
1920 static struct rt6_info *ip6_pol_route_input(struct net *net,
1921 struct fib6_table *table,
1923 const struct sk_buff *skb,
1926 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
/* ip6_route_input_lookup - rule-based route lookup for a received packet.
 * Adds RT6_LOOKUP_F_IFACE to @flags when rt6_need_strict() holds for the
 * destination address and the device type is not ARPHRD_PIMREG, then runs
 * fib6_rule_lookup() with ip6_pol_route_input as the per-table callback.
 */
1929 struct dst_entry *ip6_route_input_lookup(struct net *net,
1930 struct net_device *dev,
1932 const struct sk_buff *skb,
1935 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1936 flags |= RT6_LOOKUP_F_IFACE;
1938 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1940 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* ip6_multipath_l3_keys - fill @keys with the L3 fields used for
 * multipath hashing of @skb.
 * By default the outer IPv6 header is the key source. When the next header
 * is ICMPv6 and the ICMPv6 type is one of DEST_UNREACH, PKT_TOOBIG,
 * TIME_EXCEED or PARAMPROB, the IPv6 header embedded after the ICMPv6
 * header is fetched with skb_header_pointer() and used instead.
 * Two fill paths are visible: one copies addresses, flow label and
 * protocol from the supplied flow keys (_flkeys), the other takes them
 * from the chosen IPv6 header (key_iph).
 */
1942 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1943 struct flow_keys *keys,
1944 struct flow_keys *flkeys)
1946 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1947 const struct ipv6hdr *key_iph = outer_iph;
1948 struct flow_keys *_flkeys = flkeys;
1949 const struct ipv6hdr *inner_iph;
1950 const struct icmp6hdr *icmph;
1951 struct ipv6hdr _inner_iph;
1952 struct icmp6hdr _icmph;
1954 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1957 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1958 sizeof(_icmph), &_icmph);
1962 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1963 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1964 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1965 icmph->icmp6_type != ICMPV6_PARAMPROB)
1968 inner_iph = skb_header_pointer(skb,
1969 skb_transport_offset(skb) + sizeof(*icmph),
1970 sizeof(_inner_iph), &_inner_iph);
1974 key_iph = inner_iph;
1978 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1979 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1980 keys->tags.flow_label = _flkeys->tags.flow_label;
1981 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1983 keys->addrs.v6addrs.src = key_iph->saddr;
1984 keys->addrs.v6addrs.dst = key_iph->daddr;
1985 keys->tags.flow_label = ip6_flowlabel(key_iph);
1986 keys->basic.ip_proto = key_iph->nexthdr;
/* rt6_multipath_hash - compute the multipath hash for a flow.
 * Switches on ip6_multipath_hash_policy(net); two groups of assignments
 * are visible:
 *  - an L3 variant that zeroes hash_keys and fills addresses, flow label
 *    and protocol either via ip6_multipath_l3_keys() (from @skb) or
 *    directly from @fl6;
 *  - an L4 variant that can short-circuit with skb_get_hash_raw(skb) >> 1,
 *    or fills addresses, ports and protocol from flow keys (dissected with
 *    FLOW_DISSECTOR_F_STOP_AT_ENCAP) or from @fl6.
 * The final hash comes from flow_hash_from_keys(). The case labels and the
 * return statement are on lines not shown here.
 */
1990 /* if skb is set it will be used and fl6 can be NULL */
1991 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1992 const struct sk_buff *skb, struct flow_keys *flkeys)
1994 struct flow_keys hash_keys;
1997 switch (ip6_multipath_hash_policy(net)) {
1999 memset(&hash_keys, 0, sizeof(hash_keys));
2000 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2002 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2004 hash_keys.addrs.v6addrs.src = fl6->saddr;
2005 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2006 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2007 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2012 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2013 struct flow_keys keys;
2015 /* short-circuit if we already have L4 hash present */
2017 return skb_get_hash_raw(skb) >> 1;
2019 memset(&hash_keys, 0, sizeof(hash_keys));
2022 skb_flow_dissect_flow_keys(skb, &keys, flag);
2025 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2026 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2027 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2028 hash_keys.ports.src = flkeys->ports.src;
2029 hash_keys.ports.dst = flkeys->ports.dst;
2030 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2032 memset(&hash_keys, 0, sizeof(hash_keys));
2033 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2034 hash_keys.addrs.v6addrs.src = fl6->saddr;
2035 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2036 hash_keys.ports.src = fl6->fl6_sport;
2037 hash_keys.ports.dst = fl6->fl6_dport;
2038 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2042 mhash = flow_hash_from_keys(&hash_keys);
/* ip6_route_input - route a received skb.
 * Builds a flowi6 from the packet's IPv6 header (addresses, flow info,
 * next header), the receiving device's ifindex and skb->mark. If tunnel
 * metadata is attached and not in TX mode, its tun_id is copied into the
 * flow. fib6_rules_early_flow_dissect() may supply dissected flow keys,
 * and for ICMPv6 packets a multipath hash is precomputed into fl6.mp_hash.
 * The lookup itself is ip6_route_input_lookup(); the call that consumes
 * its result starts on a line not shown here.
 */
2047 void ip6_route_input(struct sk_buff *skb)
2049 const struct ipv6hdr *iph = ipv6_hdr(skb);
2050 struct net *net = dev_net(skb->dev);
2051 int flags = RT6_LOOKUP_F_HAS_SADDR;
2052 struct ip_tunnel_info *tun_info;
2053 struct flowi6 fl6 = {
2054 .flowi6_iif = skb->dev->ifindex,
2055 .daddr = iph->daddr,
2056 .saddr = iph->saddr,
2057 .flowlabel = ip6_flowinfo(iph),
2058 .flowi6_mark = skb->mark,
2059 .flowi6_proto = iph->nexthdr,
2061 struct flow_keys *flkeys = NULL, _flkeys;
2063 tun_info = skb_tunnel_info(skb);
2064 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2065 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2067 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2070 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2071 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2074 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
/* ip6_pol_route_output - output-path lookup callback.
 * Thin wrapper: calls ip6_pol_route() with the flow's outgoing interface
 * index (fl6->flowi6_oif).
 */
2077 static struct rt6_info *ip6_pol_route_output(struct net *net,
2078 struct fib6_table *table,
2080 const struct sk_buff *skb,
2083 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
/* ip6_route_output_flags - route lookup for locally generated traffic.
 * For multicast or link-local destinations an l3mdev link-scope lookup is
 * tried first. The flow's iif is then set to LOOPBACK_IFINDEX.
 * RT6_LOOKUP_F_IFACE is added when the socket is bound to a device, when
 * the destination needs strict interface matching, or when an oif is given
 * together with an unspecified source address. RT6_LOOKUP_F_HAS_SADDR and
 * the socket's source-address preferences are also folded into @flags; the
 * conditions guarding those two statements are on lines not shown here.
 * The lookup is fib6_rule_lookup() with ip6_pol_route_output.
 */
2086 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2087 struct flowi6 *fl6, int flags)
2091 if (ipv6_addr_type(&fl6->daddr) &
2092 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2093 struct dst_entry *dst;
2095 dst = l3mdev_link_scope_lookup(net, fl6);
2100 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2102 any_src = ipv6_addr_any(&fl6->saddr);
2103 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2104 (fl6->flowi6_oif && any_src))
2105 flags |= RT6_LOOKUP_F_IFACE;
2108 flags |= RT6_LOOKUP_F_HAS_SADDR;
2110 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2112 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2114 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* ip6_blackhole_route - build a discard-only copy of @dst_orig.
 * Allocates a dst from ip6_dst_blackhole_ops on the namespace's loopback
 * device with DST_OBSOLETE_DEAD, counts it in fib_rt_alloc, installs
 * dst_discard / dst_discard_out as input/output handlers and copies
 * metrics, gateway, flags (with RTF_PCPU cleared) and the destination key
 * (plus source key under CONFIG_IPV6_SUBTREES) from the original route.
 * The reference on @dst_orig is always released. Returns the new dst, or
 * ERR_PTR(-ENOMEM) when 'new' is still NULL.
 */
2116 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2118 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2119 struct net_device *loopback_dev = net->loopback_dev;
2120 struct dst_entry *new = NULL;
2122 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2123 DST_OBSOLETE_DEAD, 0);
2126 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2130 new->input = dst_discard;
2131 new->output = dst_discard_out;
2133 dst_copy_metrics(new, &ort->dst);
2135 rt->rt6i_idev = in6_dev_get(loopback_dev);
2136 rt->rt6i_gateway = ort->rt6i_gateway;
2137 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2139 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2140 #ifdef CONFIG_IPV6_SUBTREES
2141 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2145 dst_release(dst_orig);
2146 return new ? new : ERR_PTR(-ENOMEM);
/* fib6_check - validate a fib6_info against a cached cookie.
 * Two tests are visible: the cookie read by fib6_get_cookie_safe() must be
 * obtainable and equal to @cookie, and fib6_check_expired() is consulted.
 * The boolean returned for each case is on lines not shown here.
 */
2150 * Destination cache support functions
2153 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2157 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2160 if (fib6_check_expired(f6i))
/* rt6_check - dst validity check for a route without per-cpu/uncached
 * handling.
 * Tests that, when @from is set, its cookie can be read and that rt_cookie
 * equals @cookie, and separately tests rt6_check_expired(rt). The values
 * returned for each case are on lines not shown here.
 */
2166 static struct dst_entry *rt6_check(struct rt6_info *rt,
2167 struct fib6_info *from,
2172 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2173 rt_cookie != cookie)
2176 if (rt6_check_expired(rt))
/* rt6_dst_from_check - dst validity check that defers to the parent.
 * The visible condition requires all of: rt itself not expired
 * (__rt6_check_expired), dst.obsolete still DST_OBSOLETE_FORCE_CHK, and
 * fib6_check(from, cookie) succeeding. The returned values are on lines
 * not shown here.
 */
2182 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2183 struct fib6_info *from,
2186 if (!__rt6_check_expired(rt) &&
2187 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2188 fib6_check(from, cookie))
/* ip6_dst_check - dst_ops check callback for IPv6 routes.
 * Recovers the rt6_info from @dst and reads rt->from under RCU. Routes
 * that have a parent and are either per-cpu copies (RTF_PCPU) or on the
 * uncached list are validated with rt6_dst_from_check(); the other visible
 * path uses rt6_check().
 */
2194 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2196 struct dst_entry *dst_ret;
2197 struct fib6_info *from;
2198 struct rt6_info *rt;
2200 rt = container_of(dst, struct rt6_info, dst);
2204 /* All IPV6 dsts are created with ->obsolete set to the value
2205 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2206 * into this function always.
2209 from = rcu_dereference(rt->from);
2211 if (from && (rt->rt6i_flags & RTF_PCPU ||
2212 unlikely(!list_empty(&rt->rt6i_uncached))))
2213 dst_ret = rt6_dst_from_check(rt, from, cookie);
2215 dst_ret = rt6_check(rt, from, cookie);
/* ip6_negative_advice - dst_ops negative_advice callback.
 * For a cached route (RTF_CACHE) that rt6_check_expired() reports as
 * expired, the route is removed from its exception table via
 * rt6_remove_exception_rt(). What is returned to the caller is on lines
 * not shown here.
 */
2222 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2224 struct rt6_info *rt = (struct rt6_info *) dst;
2227 if (rt->rt6i_flags & RTF_CACHE) {
2229 if (rt6_check_expired(rt)) {
2230 rt6_remove_exception_rt(rt);
/* ip6_link_failure - dst_ops link_failure callback.
 * Sends an ICMPv6 destination-unreachable / address-unreachable error for
 * @skb, then looks at the route attached to the skb: a cached route
 * (RTF_CACHE) is removed from its exception table; for other routes the
 * parent's fib6_node is read under RCU and a further action is taken when
 * the node exists and the route has RTF_DEFAULT (that action is on a line
 * not shown here).
 */
2242 static void ip6_link_failure(struct sk_buff *skb)
2244 struct rt6_info *rt;
2246 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2248 rt = (struct rt6_info *) skb_dst(skb);
2251 if (rt->rt6i_flags & RTF_CACHE) {
2252 rt6_remove_exception_rt(rt);
2254 struct fib6_info *from;
2255 struct fib6_node *fn;
2257 from = rcu_dereference(rt->from);
2259 fn = rcu_dereference(from->fib6_node);
2260 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* rt6_update_expires - arm the expiry timer of route @rt0.
 * If rt0 does not yet carry RTF_EXPIRES, dst.expires is first seeded from
 * the parent fib6_info (rt0->from, read under RCU). Afterwards
 * dst_set_expires() is applied with @timeout and RTF_EXPIRES is set.
 */
2268 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2270 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2271 struct fib6_info *from;
2274 from = rcu_dereference(rt0->from);
2276 rt0->dst.expires = from->expires;
2280 dst_set_expires(&rt0->dst, timeout);
2281 rt0->rt6i_flags |= RTF_EXPIRES;
/* rt6_do_update_pmtu - record a new path MTU on route @rt.
 * Stores @mtu in the RTAX_MTU metric, flags the route RTF_MODIFIED and
 * (re)arms its expiry using the namespace's ip6_rt_mtu_expires sysctl.
 */
2284 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2286 struct net *net = dev_net(rt->dst.dev);
2288 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2289 rt->rt6i_flags |= RTF_MODIFIED;
2290 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* rt6_cache_allowed_for_pmtu - true when @rt is not itself a cached
 * route (no RTF_CACHE) and is either a per-cpu copy (RTF_PCPU) or has a
 * parent (rt->from non-NULL).
 */
2293 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2295 return !(rt->rt6i_flags & RTF_CACHE) &&
2296 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
/* __ip6_rt_update_pmtu - apply a learned path MTU to @dst.
 * Tests whether RTAX_MTU is locked on the dst. Destination/source
 * addresses come from @iph or from the socket (sk_v6_daddr and the
 * inet6 saddr); which one is used is decided on lines not shown here. The
 * neighbour is confirmed, @mtu is raised to at least IPV6_MIN_MTU and
 * compared with the current dst_mtu() (mtu >= dst_mtu).
 * When rt6_cache_allowed_for_pmtu() is false the route is updated in place
 * (and its exception timestamp refreshed if it is RTF_CACHE). Otherwise a
 * new cache clone is allocated from the parent, given the new MTU and
 * inserted as an exception; if insertion fails the clone is released
 * immediately.
 */
2299 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2300 const struct ipv6hdr *iph, u32 mtu)
2302 const struct in6_addr *daddr, *saddr;
2303 struct rt6_info *rt6 = (struct rt6_info *)dst;
2305 if (dst_metric_locked(dst, RTAX_MTU))
2309 daddr = &iph->daddr;
2310 saddr = &iph->saddr;
2312 daddr = &sk->sk_v6_daddr;
2313 saddr = &inet6_sk(sk)->saddr;
2318 dst_confirm_neigh(dst, daddr);
2319 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2320 if (mtu >= dst_mtu(dst))
2323 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2324 rt6_do_update_pmtu(rt6, mtu);
2325 /* update rt6_ex->stamp for cache */
2326 if (rt6->rt6i_flags & RTF_CACHE)
2327 rt6_update_exception_stamp_rt(rt6);
2329 struct fib6_info *from;
2330 struct rt6_info *nrt6;
2333 from = rcu_dereference(rt6->from);
2338 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2340 rt6_do_update_pmtu(nrt6, mtu);
2341 if (rt6_insert_exception(nrt6, from))
2342 dst_release_immediate(&nrt6->dst);
/* ip6_rt_update_pmtu - dst_ops update_pmtu callback.
 * Forwards to __ip6_rt_update_pmtu(), passing the skb's IPv6 header when
 * an skb is supplied and NULL otherwise.
 */
2348 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2349 struct sk_buff *skb, u32 mtu)
2351 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* ip6_update_pmtu - update the path MTU for the flow described by the
 * IPv6 header at skb->data.
 * Builds a flowi6 from that header (addresses, flow info) with the mark
 * taken from @mark or, when @mark is 0, from IP6_REPLY_MARK(); routes it
 * with ip6_route_output() and applies the MTU (converted with ntohl from
 * the big-endian @mtu) via __ip6_rt_update_pmtu().
 */
2354 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2355 int oif, u32 mark, kuid_t uid)
2357 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2358 struct dst_entry *dst;
2359 struct flowi6 fl6 = {
2361 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2362 .daddr = iph->daddr,
2363 .saddr = iph->saddr,
2364 .flowlabel = ip6_flowinfo(iph),
2368 dst = ip6_route_output(net, NULL, &fl6);
2370 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2373 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* ip6_sk_update_pmtu - socket-level PMTU update.
 * Uses the socket's bound device as oif, falling back to the l3mdev master
 * of skb->dev when the socket is unbound. After ip6_update_pmtu() it
 * inspects the socket's cached dst: the visible condition tests for no
 * dst, dst->obsolete == 0, or a successful ops->check() with the socket's
 * dst_cookie. When the socket is not owned by user context and the peer is
 * not a v4-mapped address, ip6_datagram_dst_update() is called.
 */
2375 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2377 int oif = sk->sk_bound_dev_if;
2378 struct dst_entry *dst;
2380 if (!oif && skb->dev)
2381 oif = l3mdev_master_ifindex(skb->dev);
2383 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2385 dst = __sk_dst_get(sk);
2386 if (!dst || !dst->obsolete ||
2387 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2391 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2392 ip6_datagram_dst_update(sk, false);
2395 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* ip6_sk_dst_store_flow - cache @dst on socket @sk for flow @fl6.
 * Calls ip6_dst_store(); the socket's own destination address is passed
 * only when it equals the flow's daddr (NULL otherwise). Under
 * CONFIG_IPV6_SUBTREES the flow's saddr is likewise compared with the
 * socket's inet6 saddr; the remaining arguments are on lines not shown.
 */
2397 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2398 const struct flowi6 *fl6)
2400 #ifdef CONFIG_IPV6_SUBTREES
2401 struct ipv6_pinfo *np = inet6_sk(sk);
2404 ip6_dst_store(sk, dst,
2405 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2406 &sk->sk_v6_daddr : NULL,
2407 #ifdef CONFIG_IPV6_SUBTREES
2408 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
/* ip6_redirect_nh_match - does nexthop @nh of @f6i qualify as the sender
 * of a redirect from gateway @gw?
 * The first visible test covers a dead nexthop (RTNH_F_DEAD), a nexthop
 * without gateway family, or an oif that differs from the nexthop device's
 * ifindex. If @gw differs from the nexthop gateway, the exception table is
 * searched for a cached route whose rt6i_gateway equals @gw. The returned
 * values and the use of @ret are on lines not shown here.
 */
2414 static bool ip6_redirect_nh_match(struct fib6_info *f6i,
2417 const struct in6_addr *gw,
2418 struct rt6_info **ret)
2420 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2421 fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2424 /* rt_cache's gateway might be different from its 'parent'
2425 * in the case of an ip redirect.
2426 * So we keep searching in the exception table if the gateway
2429 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2430 struct rt6_info *rt_cache;
2432 rt_cache = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
2434 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
/* struct ip6rd_flowi carries the redirecting gateway address alongside
 * the flow; __ip6_route_redirect() obtains it by casting its flowi6
 * pointer, and ip6_route_redirect() passes &rdfl.fl6 to the lookup.
 */
2443 /* Handle redirects */
2444 struct ip6rd_flowi {
2446 struct in6_addr gateway;
/* __ip6_route_redirect - per-table lookup used when handling a redirect.
 * Casts @fl6 to struct ip6rd_flowi to reach the gateway address, finds the
 * fib6 node for (daddr, saddr) and iterates its routes, testing expired
 * and RTF_REJECT routes and checking each remaining nexthop with
 * ip6_redirect_nh_match(). Fallbacks visible below: rt may become
 * fib6_null_entry, an RTF_REJECT route maps to ip6_null_entry, and a
 * null entry triggers fib6_backtrack(). The result is either held with
 * ip6_hold_safe() or created with ip6_create_rt_rcu(), and the lookup is
 * traced.
 */
2449 static struct rt6_info *__ip6_route_redirect(struct net *net,
2450 struct fib6_table *table,
2452 const struct sk_buff *skb,
2455 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2456 struct rt6_info *ret = NULL;
2457 struct fib6_info *rt;
2458 struct fib6_node *fn;
2460 /* Get the "current" route for this destination and
2461 * check if the redirect has come from appropriate router.
2463 * RFC 4861 specifies that redirects should only be
2464 * accepted if they come from the nexthop to the target.
2465 * Due to the way the routes are chosen, this notion
2466 * is a bit fuzzy and one might need to check all possible
2471 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2473 for_each_fib6_node_rt_rcu(fn) {
2474 if (fib6_check_expired(rt))
2476 if (rt->fib6_flags & RTF_REJECT)
2478 if (ip6_redirect_nh_match(rt, &rt->fib6_nh, fl6,
2479 &rdfl->gateway, &ret))
2484 rt = net->ipv6.fib6_null_entry;
2485 else if (rt->fib6_flags & RTF_REJECT) {
2486 ret = net->ipv6.ip6_null_entry;
2490 if (rt == net->ipv6.fib6_null_entry) {
2491 fn = fib6_backtrack(fn, &fl6->saddr);
2498 ip6_hold_safe(net, &ret);
2500 ret = ip6_create_rt_rcu(rt);
2504 trace_fib6_table_lookup(net, rt, table, fl6);
/* ip6_route_redirect - rule-based lookup of the route a redirect from
 * @gateway applies to.
 * Copies *gateway into a local ip6rd_flowi and runs fib6_rule_lookup()
 * on its fl6 member with __ip6_route_redirect as the per-table callback
 * and RT6_LOOKUP_F_HAS_SADDR as initial flags.
 */
2508 static struct dst_entry *ip6_route_redirect(struct net *net,
2509 const struct flowi6 *fl6,
2510 const struct sk_buff *skb,
2511 const struct in6_addr *gateway)
2513 int flags = RT6_LOOKUP_F_HAS_SADDR;
2514 struct ip6rd_flowi rdfl;
2517 rdfl.gateway = *gateway;
2519 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2520 flags, __ip6_route_redirect);
/* ip6_redirect - process a redirect for the flow whose IPv6 header sits
 * at skb->data.
 * Builds a flowi6 from that header with iif set to LOOPBACK_IFINDEX and
 * the given @mark, looks up the affected route with ip6_route_redirect()
 * using the source address of ipv6_hdr(skb) as the redirecting gateway,
 * and hands the result to rt6_do_redirect().
 */
2523 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2526 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2527 struct dst_entry *dst;
2528 struct flowi6 fl6 = {
2529 .flowi6_iif = LOOPBACK_IFINDEX,
2531 .flowi6_mark = mark,
2532 .daddr = iph->daddr,
2533 .saddr = iph->saddr,
2534 .flowlabel = ip6_flowinfo(iph),
2538 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2539 rt6_do_redirect(dst, NULL, skb);
2542 EXPORT_SYMBOL_GPL(ip6_redirect);
/* ip6_redirect_no_header - redirect handling variant that builds its
 * flow from the redirect packet itself.
 * The flow's source address is the destination of the received packet
 * (iph->daddr), iif is LOOPBACK_IFINDEX and the uid comes from
 * sock_net_uid(net, NULL). The ICMPv6 header is viewed as a struct rd_msg;
 * its use is on lines not shown here. The packet's source address acts as
 * the redirecting gateway for ip6_route_redirect(), and the result is
 * passed to rt6_do_redirect().
 */
2544 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2546 const struct ipv6hdr *iph = ipv6_hdr(skb);
2547 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2548 struct dst_entry *dst;
2549 struct flowi6 fl6 = {
2550 .flowi6_iif = LOOPBACK_IFINDEX,
2553 .saddr = iph->daddr,
2554 .flowi6_uid = sock_net_uid(net, NULL),
2557 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2558 rt6_do_redirect(dst, NULL, skb);
/* ip6_sk_redirect - socket-level wrapper around ip6_redirect().
 * Supplies the socket's network namespace, bound device index and mark;
 * the final argument is on a line not shown here.
 */
2562 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2564 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2567 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* ip6_default_advmss - default advertised MSS for a dst.
 * Starts from dst_mtu(), subtracts the IPv6 and TCP header sizes, raises
 * the result to the ip6_rt_min_advmss sysctl when it is smaller, and
 * finally tests it against IPV6_MAXPLEN - sizeof(struct tcphdr); the value
 * used when that limit is exceeded is on a line not shown here.
 */
2569 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2571 struct net_device *dev = dst->dev;
2572 unsigned int mtu = dst_mtu(dst);
2573 struct net *net = dev_net(dev);
2575 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2577 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2578 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2581 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2582 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2583 * IPV6_MAXPLEN is also valid and means: "any MSS,
2584 * rely only on pmtu discovery"
2586 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* ip6_mtu - dst_ops mtu callback.
 * Reads the raw RTAX_MTU metric of @dst; the other visible source is
 * cnf.mtu6 of the inet6_dev of dst->dev (the conditions choosing between
 * them are on lines not shown here). The value is capped at IP6_MAX_MTU
 * and lwtunnel_headroom() for dst->lwtstate is subtracted.
 */
2591 static unsigned int ip6_mtu(const struct dst_entry *dst)
2593 struct inet6_dev *idev;
2596 mtu = dst_metric_raw(dst, RTAX_MTU);
2603 idev = __in6_dev_get(dst->dev);
2605 mtu = idev->cnf.mtu6;
2609 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2611 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* ip6_mtu_from_fib6 - MTU for a fib6_info and (daddr, saddr) pair.
 * Visible steps, in the order of the list in the comment below:
 *  - if RTAX_MTU is locked on @f6i, f6i->fib6_pmtu is used;
 *  - an unexpired exception found via __rt6_find_exception_rcu() supplies
 *    its raw RTAX_MTU metric;
 *  - the nexthop device's cnf.mtu6 is used when an inet6_dev exists and
 *    its mtu6 is larger than the current value.
 * The result is capped at IP6_MAX_MTU and reduced by lwtunnel_headroom()
 * for the nexthop's lwtunnel state.
 */
2615 * 1. mtu on route is locked - use it
2616 * 2. mtu from nexthop exception
2617 * 3. mtu from egress device
2619 * based on ip6_dst_mtu_forward and exception logic of
2620 * rt6_find_cached_rt; called with rcu_read_lock
2622 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2623 struct in6_addr *saddr)
2625 struct rt6_exception_bucket *bucket;
2626 struct rt6_exception *rt6_ex;
2627 struct in6_addr *src_key;
2628 struct inet6_dev *idev;
2631 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2632 mtu = f6i->fib6_pmtu;
2638 #ifdef CONFIG_IPV6_SUBTREES
2639 if (f6i->fib6_src.plen)
2643 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2644 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2645 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2646 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2649 struct net_device *dev = fib6_info_nh_dev(f6i);
2652 idev = __in6_dev_get(dev);
2653 if (idev && idev->cnf.mtu6 > mtu)
2654 mtu = idev->cnf.mtu6;
2657 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2659 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
/* icmp6_dst_alloc - allocate a host dst on @dev for the flow's
 * destination address.
 * Takes a reference on the device's inet6_dev (returns ERR_PTR(-ENODEV)
 * if there is none) and allocates an rt6_info with ip6_dst_alloc()
 * (ERR_PTR(-ENOMEM) on failure). The route is marked DST_HOST, wired to
 * ip6_input/ip6_output, given fl6->daddr as both gateway and /128
 * destination, and its RTAX_HOPLIMIT metric is set to 0. It is put on the
 * uncached list and counted in fib_rt_uncache before being passed through
 * xfrm_lookup().
 */
2662 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2665 struct dst_entry *dst;
2666 struct rt6_info *rt;
2667 struct inet6_dev *idev = in6_dev_get(dev);
2668 struct net *net = dev_net(dev);
2670 if (unlikely(!idev))
2671 return ERR_PTR(-ENODEV);
2673 rt = ip6_dst_alloc(net, dev, 0);
2674 if (unlikely(!rt)) {
2676 dst = ERR_PTR(-ENOMEM);
2680 rt->dst.flags |= DST_HOST;
2681 rt->dst.input = ip6_input;
2682 rt->dst.output = ip6_output;
2683 rt->rt6i_gateway = fl6->daddr;
2684 rt->rt6i_dst.addr = fl6->daddr;
2685 rt->rt6i_dst.plen = 128;
2686 rt->rt6i_idev = idev;
2687 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2689 /* Add this dst into uncached_list so that rt6_disable_ip() can
2690 * do proper release of the net_device
2692 rt6_uncached_list_add(rt);
2693 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2695 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* ip6_dst_gc - dst_ops gc callback for the IPv6 dst cache.
 * Snapshots the gc sysctls of the owning namespace. The first visible test
 * covers the case where the minimum interval since the last gc has not
 * elapsed and the fast entry count is within ip6_rt_max_size. Otherwise
 * ip6_rt_gc_expire is incremented and fib6_run_gc() is run with it; when
 * the slow entry count drops below ops->gc_thresh, ip6_rt_gc_expire is
 * reset to half of the gc timeout. ip6_rt_gc_expire is then decayed by
 * its own value shifted right by the elasticity sysctl.
 * Returns non-zero when the entry count still exceeds ip6_rt_max_size.
 */
2701 static int ip6_dst_gc(struct dst_ops *ops)
2703 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2704 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2705 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2706 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2707 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2708 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2711 entries = dst_entries_get_fast(ops);
2712 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2713 entries <= rt_max_size)
2716 net->ipv6.ip6_rt_gc_expire++;
2717 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2718 entries = dst_entries_get_slow(ops);
2719 if (entries < ops->gc_thresh)
2720 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2722 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2723 return entries > rt_max_size;
/* ip6_nh_lookup_table - look up the route towards a nexthop gateway in
 * one specific table.
 * Builds a flowi6 from the route config (oif = fc_ifindex,
 * saddr = fc_prefsrc), fetches table @tbid, adds RT6_LOOKUP_F_HAS_SADDR
 * when a preferred source is configured, always adds
 * RT6_LOOKUP_F_IGNORE_LINKSTATE, and calls ip6_pol_route(). A result equal
 * to ip6_null_entry is treated as a failed table lookup; the handling is
 * on lines not shown here.
 */
2726 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2727 struct fib6_config *cfg,
2728 const struct in6_addr *gw_addr,
2729 u32 tbid, int flags)
2731 struct flowi6 fl6 = {
2732 .flowi6_oif = cfg->fc_ifindex,
2734 .saddr = cfg->fc_prefsrc,
2736 struct fib6_table *table;
2737 struct rt6_info *rt;
2739 table = fib6_get_table(net, tbid);
2743 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2744 flags |= RT6_LOOKUP_F_HAS_SADDR;
2746 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2747 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2749 /* if table lookup failed, fall back to full lookup */
2750 if (rt == net->ipv6.ip6_null_entry) {
/* ip6_route_check_nh_onlink - validate the gateway of an ONLINK nexthop.
 * Looks the gateway up in the device's l3mdev table (RT_TABLE_MAIN when
 * there is none). The extack message "Nexthop has invalid gateway or
 * device mismatch" is set when the lookup result has no dst error, has a
 * parent that is not the default route, and either carries one of
 * RTF_LOCAL / RTF_ANYCAST / RTF_REJECT or points at a different device
 * than @dev. The returned error value is on lines not shown here.
 */
2758 static int ip6_route_check_nh_onlink(struct net *net,
2759 struct fib6_config *cfg,
2760 const struct net_device *dev,
2761 struct netlink_ext_ack *extack)
2763 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2764 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2765 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2766 struct fib6_info *from;
2767 struct rt6_info *grt;
2771 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2774 from = rcu_dereference(grt->from);
2775 if (!grt->dst.error &&
2776 /* ignore match if it is the default route */
2777 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2778 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2779 NL_SET_ERR_MSG(extack,
2780 "Nexthop has invalid gateway or device mismatch");
/* ip6_route_check_nh - resolve and validate the route to a (non-onlink)
 * nexthop gateway.
 * err starts as -EHOSTUNREACH. When the config names a table, the gateway
 * is first looked up there with RT6_LOOKUP_F_IFACE and the result is
 * tested for RTF_GATEWAY or a device mismatch; rt6_lookup() is the other
 * visible lookup. The result's device is compared with the requested one,
 * and *_dev / *idev can be filled from the lookup result (with a hold
 * taken on the inet6_dev). A final test checks that the result does not
 * have RTF_GATEWAY set. The assignments to err are on lines not shown.
 */
2791 static int ip6_route_check_nh(struct net *net,
2792 struct fib6_config *cfg,
2793 struct net_device **_dev,
2794 struct inet6_dev **idev)
2796 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2797 struct net_device *dev = _dev ? *_dev : NULL;
2798 struct rt6_info *grt = NULL;
2799 int err = -EHOSTUNREACH;
2801 if (cfg->fc_table) {
2802 int flags = RT6_LOOKUP_F_IFACE;
2804 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2805 cfg->fc_table, flags);
2807 if (grt->rt6i_flags & RTF_GATEWAY ||
2808 (dev && dev != grt->dst.dev)) {
2816 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2822 if (dev != grt->dst.dev) {
2827 *_dev = dev = grt->dst.dev;
2828 *idev = grt->rt6i_idev;
2830 in6_dev_hold(grt->rt6i_idev);
2833 if (!(grt->rt6i_flags & RTF_GATEWAY))
/* ip6_validate_gw - validate the gateway address of a new route.
 * Visible checks, each reporting through @extack:
 *  - the gateway must not be a local address ("Gateway can not be a local
 *    address"); the check is repeated after device resolution when no
 *    device was known at entry (need_addr_check);
 *  - a gateway that is not link-local unicast must at least be unicast or
 *    IPv4-mapped ("Invalid gateway address") and is then verified through
 *    ip6_route_check_nh_onlink() when RTNH_F_ONLINK is set, or
 *    ip6_route_check_nh() otherwise;
 *  - an egress device must be known ("Egress device not specified") and
 *    must not be a loopback device.
 * skip_dev is false only for link-local gateways and is forwarded to
 * ipv6_chk_addr_and_flags().
 */
2842 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2843 struct net_device **_dev, struct inet6_dev **idev,
2844 struct netlink_ext_ack *extack)
2846 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2847 int gwa_type = ipv6_addr_type(gw_addr);
2848 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2849 const struct net_device *dev = *_dev;
2850 bool need_addr_check = !dev;
2853 /* if gw_addr is local we will fail to detect this in case
2854 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2855 * will return already-added prefix route via interface that
2856 * prefix route was assigned to, which might be non-loopback.
2859 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2860 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2864 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2865 /* IPv6 strictly inhibits using not link-local
2866 * addresses as nexthop address.
2867 * Otherwise, router will not able to send redirects.
2868 * It is very good, but in some (rare!) circumstances
2869 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2870 * some exceptions. --ANK
2871 * We allow IPv4-mapped nexthops to support RFC4798-type
2874 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2875 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2879 if (cfg->fc_flags & RTNH_F_ONLINK)
2880 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2882 err = ip6_route_check_nh(net, cfg, _dev, idev);
2888 /* reload in case device was changed */
2893 NL_SET_ERR_MSG(extack, "Egress device not specified");
2895 } else if (dev->flags & IFF_LOOPBACK) {
2896 NL_SET_ERR_MSG(extack,
2897 "Egress device can not be loopback device for this route");
2901 /* if we did not check gw_addr above, do so now that the
2902 * egress device has been resolved.
2904 if (need_addr_check &&
2905 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2906 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
/* Tell whether a route has to be treated as a reject route: either
 * RTF_REJECT is set in @flags, or @dev is a loopback device while the
 * destination address type @addr_type is not loopback and RTF_LOCAL is
 * not set in @flags.
 */
2915 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2917 if ((flags & RTF_REJECT) ||
2918 (dev && (dev->flags & IFF_LOOPBACK) &&
2919 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2920 !(flags & RTF_LOCAL)))
/* Initialise the nexthop @fib6_nh from the route configuration @cfg.
 *
 * cfg->fc_ifindex, when set, is resolved to a device and its inet6_dev.
 * An RTNH_F_ONLINK nexthop needs a device that is up. Routes that
 * fib6_is_reject() classifies as reject routes are moved to the
 * namespace's loopback device. For RTF_GATEWAY routes the gateway is
 * validated with ip6_validate_gw() and recorded. The nexthop is refused
 * when IPv6 is disabled on the device or when the device is down and
 * cfg->fc_ignore_dev_down is not set. RTNH_F_LINKDOWN is set when the
 * carrier is off and the route is neither RTF_LOCAL nor RTF_ANYCAST.
 * The common nexthop part (including encap state) is set up with
 * fib_nh_common_init(). Problems are reported through @extack.
 */
2926 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2927 struct fib6_config *cfg, gfp_t gfp_flags,
2928 struct netlink_ext_ack *extack)
2930 struct net_device *dev = NULL;
2931 struct inet6_dev *idev = NULL;
2935 fib6_nh->fib_nh_family = AF_INET6;
2938 if (cfg->fc_ifindex) {
2939 dev = dev_get_by_index(net, cfg->fc_ifindex);
2942 idev = in6_dev_get(dev);
2947 if (cfg->fc_flags & RTNH_F_ONLINK) {
2949 NL_SET_ERR_MSG(extack,
2950 "Nexthop device required for onlink");
2954 if (!(dev->flags & IFF_UP)) {
2955 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2960 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
2963 fib6_nh->fib_nh_weight = 1;
2965 /* We cannot add true routes via loopback here,
2966 * they would result in kernel looping; promote them to reject routes
2968 addr_type = ipv6_addr_type(&cfg->fc_dst);
2969 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
2970 /* hold loopback dev/idev if we haven't done so. */
2971 if (dev != net->loopback_dev) {
2976 dev = net->loopback_dev;
2978 idev = in6_dev_get(dev);
2987 if (cfg->fc_flags & RTF_GATEWAY) {
2988 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2992 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
2993 fib6_nh->fib_nh_gw_family = AF_INET6;
3000 if (idev->cnf.disable_ipv6) {
3001 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3006 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3007 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3012 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3013 !netif_carrier_ok(dev))
3014 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3016 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3017 cfg->fc_encap_type, cfg, gfp_flags, extack);
3021 fib6_nh->fib_nh_dev = dev;
3022 fib6_nh->fib_nh_oif = dev->ifindex;
3029 lwtstate_put(fib6_nh->fib_nh_lws);
3030 fib6_nh->fib_nh_lws = NULL;
/* Release what @fib6_nh holds by releasing its embedded nh_common. */
3038 void fib6_nh_release(struct fib6_nh *fib6_nh)
3040 fib_nh_common_release(&fib6_nh->nh_common);
/* Build a fib6_info from the configuration @cfg.
 *
 * Refused with an extack message: the internal flags RTF_PCPU and
 * RTF_CACHE, a route type above RTN_MAX, destination or source prefix
 * lengths above 128 and, without CONFIG_IPV6_SUBTREES, any source prefix.
 * The table cfg->fc_table is looked up or created (a warning is logged
 * when a netlink request lacks NLM_F_CREATE). The new entry then receives
 * its metrics, expiry, protocol, metric, type, flags, destination and
 * source prefixes, its nexthop (fib6_nh_init()) and the preferred source
 * address, which must pass ipv6_chk_addr() for the nexthop device.
 * The error path releases the entry and returns ERR_PTR(err).
 */
3043 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3045 struct netlink_ext_ack *extack)
3047 struct net *net = cfg->fc_nlinfo.nl_net;
3048 struct fib6_info *rt = NULL;
3049 struct fib6_table *table;
3053 /* RTF_PCPU is an internal flag; can not be set by userspace */
3054 if (cfg->fc_flags & RTF_PCPU) {
3055 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3059 /* RTF_CACHE is an internal flag; can not be set by userspace */
3060 if (cfg->fc_flags & RTF_CACHE) {
3061 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3065 if (cfg->fc_type > RTN_MAX) {
3066 NL_SET_ERR_MSG(extack, "Invalid route type");
3070 if (cfg->fc_dst_len > 128) {
3071 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3074 if (cfg->fc_src_len > 128) {
3075 NL_SET_ERR_MSG(extack, "Invalid source address length");
3078 #ifndef CONFIG_IPV6_SUBTREES
3079 if (cfg->fc_src_len) {
3080 NL_SET_ERR_MSG(extack,
3081 "Specifying source address requires IPV6_SUBTREES to be enabled");
3087 if (cfg->fc_nlinfo.nlh &&
3088 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3089 table = fib6_get_table(net, cfg->fc_table);
3091 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3092 table = fib6_new_table(net, cfg->fc_table);
3095 table = fib6_new_table(net, cfg->fc_table);
3102 rt = fib6_info_alloc(gfp_flags);
3106 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3108 if (IS_ERR(rt->fib6_metrics)) {
3109 err = PTR_ERR(rt->fib6_metrics);
3110 /* Do not leave garbage there. */
3111 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3115 if (cfg->fc_flags & RTF_ADDRCONF)
3116 rt->dst_nocount = true;
3118 if (cfg->fc_flags & RTF_EXPIRES)
3119 fib6_set_expires(rt, jiffies +
3120 clock_t_to_jiffies(cfg->fc_expires));
3122 fib6_clean_expires(rt);
3124 if (cfg->fc_protocol == RTPROT_UNSPEC)
3125 cfg->fc_protocol = RTPROT_BOOT;
3126 rt->fib6_protocol = cfg->fc_protocol;
3128 rt->fib6_table = table;
3129 rt->fib6_metric = cfg->fc_metric;
3130 rt->fib6_type = cfg->fc_type;
3131 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3133 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3134 rt->fib6_dst.plen = cfg->fc_dst_len;
3135 if (rt->fib6_dst.plen == 128)
3136 rt->dst_host = true;
3138 #ifdef CONFIG_IPV6_SUBTREES
3139 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3140 rt->fib6_src.plen = cfg->fc_src_len;
3142 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3146 /* We cannot add true routes via loopback here,
3147 * they would result in kernel looping; promote them to reject routes
3149 addr_type = ipv6_addr_type(&cfg->fc_dst);
3150 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3151 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3153 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3154 struct net_device *dev = fib6_info_nh_dev(rt);
3156 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3157 NL_SET_ERR_MSG(extack, "Invalid source address");
3161 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3162 rt->fib6_prefsrc.plen = 128;
3164 rt->fib6_prefsrc.plen = 0;
3168 fib6_info_release(rt);
3169 return ERR_PTR(err);
/* Create a route from @cfg with ip6_route_info_create() and insert it
 * with __ip6_ins_rt(). The reference on the new entry is dropped with
 * fib6_info_release() after the insertion attempt.
 */
3172 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3173 struct netlink_ext_ack *extack)
3175 struct fib6_info *rt;
3178 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3182 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3183 fib6_info_release(rt);
/* Delete @rt from its FIB table.
 *
 * fib6_null_entry is special-cased. Otherwise fib6_del() runs with the
 * table's tb6_lock held (bottom halves disabled), after which a reference
 * on @rt is dropped with fib6_info_release().
 */
3188 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3190 struct net *net = info->nl_net;
3191 struct fib6_table *table;
3194 if (rt == net->ipv6.fib6_null_entry) {
3199 table = rt->fib6_table;
3200 spin_lock_bh(&table->tb6_lock);
3201 err = fib6_del(rt, info);
3202 spin_unlock_bh(&table->tb6_lock);
3205 fib6_info_release(rt);
/* Delete @rt using a nl_info that only carries the namespace @net. */
3209 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3211 struct nl_info info = { .nl_net = net };
3213 return __ip6_del_rt(rt, &info);
/* Delete @rt and, when it has siblings and cfg->fc_delete_all_nh is set,
 * its sibling routes as well.
 *
 * In the multipath case one RTM_DELROUTE message describing @rt is built
 * before anything is deleted and info->skip_notify is set, so that a
 * single notification covers all hops; it is sent with rtnl_notify()
 * after tb6_lock has been released. Deletions run under the table's
 * tb6_lock.
 */
3216 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3218 struct nl_info *info = &cfg->fc_nlinfo;
3219 struct net *net = info->nl_net;
3220 struct sk_buff *skb = NULL;
3221 struct fib6_table *table;
3224 if (rt == net->ipv6.fib6_null_entry)
3226 table = rt->fib6_table;
3227 spin_lock_bh(&table->tb6_lock);
3229 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3230 struct fib6_info *sibling, *next_sibling;
3232 /* prefer to send a single notification with all hops */
3233 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3235 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3237 if (rt6_fill_node(net, skb, rt, NULL,
3238 NULL, NULL, 0, RTM_DELROUTE,
3239 info->portid, seq, 0) < 0) {
3243 info->skip_notify = 1;
3246 list_for_each_entry_safe(sibling, next_sibling,
3249 err = fib6_del(sibling, info);
3255 err = fib6_del(rt, info);
3257 spin_unlock_bh(&table->tb6_lock);
3259 fib6_info_release(rt);
3262 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3263 info->nlh, gfp_any());
/* Remove the cached route @rt through rt6_remove_exception_rt().
 * Beforehand @rt is compared against @cfg: its device ifindex when
 * cfg->fc_ifindex is set, and its gateway when RTF_GATEWAY is requested.
 */
3268 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3272 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3275 if (cfg->fc_flags & RTF_GATEWAY &&
3276 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3279 rc = rt6_remove_exception_rt(rt);
/* Delete the route described by @cfg.
 *
 * The table cfg->fc_table must exist ("FIB table does not exist"
 * otherwise). The node for the destination/source prefix is located with
 * fib6_locate() and its routes are walked. For RTF_CACHE requests the
 * matching cached route is looked up and removed with
 * ip6_del_cached_rt(). Otherwise candidates are compared against the
 * requested ifindex, gateway, metric and protocol, and a reference is
 * taken with fib6_info_hold_safe(). When a gateway was specified only
 * that hop is deleted; otherwise the route is deleted through
 * __ip6_del_rt_siblings().
 */
3284 static int ip6_route_del(struct fib6_config *cfg,
3285 struct netlink_ext_ack *extack)
3287 struct rt6_info *rt_cache;
3288 struct fib6_table *table;
3289 struct fib6_info *rt;
3290 struct fib6_node *fn;
3293 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3295 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3301 fn = fib6_locate(&table->tb6_root,
3302 &cfg->fc_dst, cfg->fc_dst_len,
3303 &cfg->fc_src, cfg->fc_src_len,
3304 !(cfg->fc_flags & RTF_CACHE));
3307 for_each_fib6_node_rt_rcu(fn) {
3310 if (cfg->fc_flags & RTF_CACHE) {
3313 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3316 rc = ip6_del_cached_rt(rt_cache, cfg);
3326 if (cfg->fc_ifindex &&
3328 nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3330 if (cfg->fc_flags & RTF_GATEWAY &&
3331 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3333 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3335 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3337 if (!fib6_info_hold_safe(rt))
3341 /* if gateway was specified only delete the one hop */
3342 if (cfg->fc_flags & RTF_GATEWAY)
3343 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3345 return __ip6_del_rt_siblings(rt, cfg);
/* Process the ICMPv6 Redirect message in @skb for the route @dst.
 *
 * Checks performed on the message: option length, destination not
 * multicast, target either equal to the destination or a link-local
 * unicast address, the forwarding/accept_redirects settings of the
 * receiving device, well-formed ND options and target link-layer address,
 * and @dst not being a reject route. The neighbour entry of the target is
 * then updated through ndisc_update(), a cached route for msg->dest is
 * allocated from the originating fib6_info and inserted with
 * rt6_insert_exception(), and NETEVENT_REDIRECT notifiers are called.
 *
 * NOTE(review): the debug messages use two different prefixes,
 * "rt6_do_redirect:" and "rt6_redirect:".
 */
3353 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3355 struct netevent_redirect netevent;
3356 struct rt6_info *rt, *nrt = NULL;
3357 struct ndisc_options ndopts;
3358 struct inet6_dev *in6_dev;
3359 struct neighbour *neigh;
3360 struct fib6_info *from;
3362 int optlen, on_link;
3365 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3366 optlen -= sizeof(*msg);
3369 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3373 msg = (struct rd_msg *)icmp6_hdr(skb);
3375 if (ipv6_addr_is_multicast(&msg->dest)) {
3376 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3381 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3383 } else if (ipv6_addr_type(&msg->target) !=
3384 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3385 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3389 in6_dev = __in6_dev_get(skb->dev);
3392 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3396 * The IP source address of the Redirect MUST be the same as the current
3397 * first-hop router for the specified ICMP Destination Address.
3400 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3401 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3406 if (ndopts.nd_opts_tgt_lladdr) {
3407 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3410 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3415 rt = (struct rt6_info *) dst;
3416 if (rt->rt6i_flags & RTF_REJECT) {
3417 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3421 /* Redirect received -> path was valid.
3422 * Look, redirects are sent only in response to data packets,
3423 * so that this nexthop apparently is reachable. --ANK
3425 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3427 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3432 * We have finally decided to accept it.
3435 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3436 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3437 NEIGH_UPDATE_F_OVERRIDE|
3438 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3439 NEIGH_UPDATE_F_ISROUTER)),
3440 NDISC_REDIRECT, &ndopts);
3443 from = rcu_dereference(rt->from);
3444 /* This fib6_info_hold() is safe here because we hold reference to rt
3445 * and rt already holds reference to fib6_info.
3447 fib6_info_hold(from);
3450 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3454 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3456 nrt->rt6i_flags &= ~RTF_GATEWAY;
3458 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3460 /* No need to remove rt from the exception table if rt is
3461 * a cached route because rt6_insert_exception() will
3464 if (rt6_insert_exception(nrt, from)) {
3465 dst_release_immediate(&nrt->dst);
3469 netevent.old = &rt->dst;
3470 netevent.new = &nrt->dst;
3471 netevent.daddr = &msg->dest;
3472 netevent.neigh = neigh;
3473 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3476 fib6_info_release(from);
3477 neigh_release(neigh);
3480 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up the route-information route for @prefix/@prefixlen that was
 * learned via gateway @gwaddr on @dev.
 *
 * The search uses the l3mdev table of @dev (RT6_TABLE_INFO when there is
 * none). Entries are compared on nexthop device ifindex, the
 * RTF_ROUTEINFO flag together with a gateway being present, and the
 * gateway address; fib6_info_hold_safe() is used to take a reference.
 */
3481 static struct fib6_info *rt6_get_route_info(struct net *net,
3482 const struct in6_addr *prefix, int prefixlen,
3483 const struct in6_addr *gwaddr,
3484 struct net_device *dev)
3486 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3487 int ifindex = dev->ifindex;
3488 struct fib6_node *fn;
3489 struct fib6_info *rt = NULL;
3490 struct fib6_table *table;
3492 table = fib6_get_table(net, tb_id);
3497 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3501 for_each_fib6_node_rt_rcu(fn) {
3502 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3504 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3505 !rt->fib6_nh.fib_nh_gw_family)
3507 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3509 if (!fib6_info_hold_safe(rt))
/* Add a route-information route (RTPROT_RA, RTF_ROUTEINFO) for
 * @prefix/@prefixlen via gateway @gwaddr on @dev.
 *
 * The route goes into the l3mdev table of @dev (RT6_TABLE_INFO when there
 * is none) and is marked RTF_DEFAULT when the prefix length is 0. The
 * result of ip6_route_add() is not examined; the function returns
 * whatever rt6_get_route_info() finds afterwards.
 */
3518 static struct fib6_info *rt6_add_route_info(struct net *net,
3519 const struct in6_addr *prefix, int prefixlen,
3520 const struct in6_addr *gwaddr,
3521 struct net_device *dev,
3524 struct fib6_config cfg = {
3525 .fc_metric = IP6_RT_PRIO_USER,
3526 .fc_ifindex = dev->ifindex,
3527 .fc_dst_len = prefixlen,
3528 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3529 RTF_UP | RTF_PREF(pref),
3530 .fc_protocol = RTPROT_RA,
3531 .fc_type = RTN_UNICAST,
3532 .fc_nlinfo.portid = 0,
3533 .fc_nlinfo.nlh = NULL,
3534 .fc_nlinfo.nl_net = net,
3537 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3538 cfg.fc_dst = *prefix;
3539 cfg.fc_gateway = *gwaddr;
3541 /* We should treat it as a default route if prefix length is 0. */
3543 cfg.fc_flags |= RTF_DEFAULT;
3545 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3547 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Look up the default-router route via gateway @addr on @dev.
 *
 * The root node of the l3mdev table of @dev (RT6_TABLE_DFLT when there is
 * none) is walked for an entry whose nexthop device is @dev, that has
 * both RTF_ADDRCONF and RTF_DEFAULT set and whose gateway equals @addr.
 * fib6_info_hold_safe() is used to take a reference on the match.
 */
3551 struct fib6_info *rt6_get_dflt_router(struct net *net,
3552 const struct in6_addr *addr,
3553 struct net_device *dev)
3555 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3556 struct fib6_info *rt;
3557 struct fib6_table *table;
3559 table = fib6_get_table(net, tb_id);
3564 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3565 struct fib6_nh *nh = &rt->fib6_nh;
3567 if (dev == nh->fib_nh_dev &&
3568 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3569 ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3572 if (rt && !fib6_info_hold_safe(rt))
/* Add a default route via gateway @gwaddr on @dev (RTPROT_RA, with
 * RTF_ADDRCONF | RTF_DEFAULT | RTF_EXPIRES) to the l3mdev table of @dev,
 * or to RT6_TABLE_DFLT when there is none.
 *
 * When ip6_route_add() succeeds the table is flagged
 * RT6_TABLE_HAS_DFLT_ROUTER. Returns whatever rt6_get_dflt_router()
 * finds afterwards.
 */
3578 struct fib6_info *rt6_add_dflt_router(struct net *net,
3579 const struct in6_addr *gwaddr,
3580 struct net_device *dev,
3583 struct fib6_config cfg = {
3584 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3585 .fc_metric = IP6_RT_PRIO_USER,
3586 .fc_ifindex = dev->ifindex,
3587 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3588 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3589 .fc_protocol = RTPROT_RA,
3590 .fc_type = RTN_UNICAST,
3591 .fc_nlinfo.portid = 0,
3592 .fc_nlinfo.nlh = NULL,
3593 .fc_nlinfo.nl_net = net,
3596 cfg.fc_gateway = *gwaddr;
3598 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3599 struct fib6_table *table;
3601 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3603 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3606 return rt6_get_dflt_router(net, gwaddr, dev);
/* Delete from the root node of @table the routes that have RTF_DEFAULT or
 * RTF_ADDRCONF set, skipping those whose nexthop device has accept_ra
 * set to 2, and clear RT6_TABLE_HAS_DFLT_ROUTER on the table.
 *
 * NOTE(review): the flag test matches when either flag is set, whereas
 * rt6_get_dflt_router() requires both - confirm this is intended.
 */
3609 static void __rt6_purge_dflt_routers(struct net *net,
3610 struct fib6_table *table)
3612 struct fib6_info *rt;
3616 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3617 struct net_device *dev = fib6_info_nh_dev(rt);
3618 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3620 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3621 (!idev || idev->cnf.accept_ra != 2) &&
3622 fib6_info_hold_safe(rt)) {
3624 ip6_del_rt(net, rt);
3630 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Walk every bucket of the namespace's FIB table hash and purge the
 * default routers of each table flagged RT6_TABLE_HAS_DFLT_ROUTER.
 */
3633 void rt6_purge_dflt_routers(struct net *net)
3635 struct fib6_table *table;
3636 struct hlist_head *head;
3641 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3642 head = &net->ipv6.fib_table_hash[h];
3643 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3644 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3645 __rt6_purge_dflt_routers(net, table);
/* Translate the ioctl route message @rtmsg into the route configuration
 * @cfg. The whole structure is overwritten by a compound literal, so
 * fields not listed are zeroed. The table selection consults
 * l3mdev_fib_table_by_index() for rtmsg_ifindex, and a zero metric is
 * replaced by IP6_RT_PRIO_USER.
 */
3652 static void rtmsg_to_fib6_config(struct net *net,
3653 struct in6_rtmsg *rtmsg,
3654 struct fib6_config *cfg)
3656 *cfg = (struct fib6_config){
3657 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3659 .fc_ifindex = rtmsg->rtmsg_ifindex,
3660 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3661 .fc_expires = rtmsg->rtmsg_info,
3662 .fc_dst_len = rtmsg->rtmsg_dst_len,
3663 .fc_src_len = rtmsg->rtmsg_src_len,
3664 .fc_flags = rtmsg->rtmsg_flags,
3665 .fc_type = rtmsg->rtmsg_type,
3667 .fc_nlinfo.nl_net = net,
3669 .fc_dst = rtmsg->rtmsg_dst,
3670 .fc_src = rtmsg->rtmsg_src,
3671 .fc_gateway = rtmsg->rtmsg_gateway,
/* Handle the SIOCADDRT and SIOCDELRT route ioctls.
 *
 * The caller needs CAP_NET_ADMIN in the user namespace owning @net. The
 * in6_rtmsg is copied from user space at @arg, converted with
 * rtmsg_to_fib6_config() and handed to ip6_route_add() or
 * ip6_route_del().
 */
3675 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3677 struct fib6_config cfg;
3678 struct in6_rtmsg rtmsg;
3682 case SIOCADDRT: /* Add a route */
3683 case SIOCDELRT: /* Delete a route */
3684 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3686 err = copy_from_user(&rtmsg, arg,
3687 sizeof(struct in6_rtmsg));
3691 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3696 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3699 err = ip6_route_del(&cfg, NULL);
3713 * Drop the packet on the floor
/* Account a packet that cannot be routed and send an ICMPv6
 * destination-unreachable message with @code.
 *
 * @ipstats_mib_noroutes selects the accounting. In the
 * IPSTATS_MIB_INNOROUTES case a packet whose destination is the
 * unspecified address is counted as IPSTATS_MIB_INADDRERRORS. The
 * IPSTATS_MIB_OUTNOROUTES case increments the counter named by
 * @ipstats_mib_noroutes.
 */
3716 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3719 struct dst_entry *dst = skb_dst(skb);
3720 switch (ipstats_mib_noroutes) {
3721 case IPSTATS_MIB_INNOROUTES:
3722 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3723 if (type == IPV6_ADDR_ANY) {
3724 IP6_INC_STATS(dev_net(dst->dev),
3725 __in6_dev_get_safely(skb->dev),
3726 IPSTATS_MIB_INADDRERRORS);
3730 case IPSTATS_MIB_OUTNOROUTES:
3731 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3732 ipstats_mib_noroutes);
3735 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* Drop @skb with ICMPV6_NOROUTE, accounted as IPSTATS_MIB_INNOROUTES. */
3740 static int ip6_pkt_discard(struct sk_buff *skb)
3742 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* Output-path variant: point skb->dev at the dst's device, then drop
 * @skb with ICMPV6_NOROUTE, accounted as IPSTATS_MIB_OUTNOROUTES.
 */
3745 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3747 skb->dev = skb_dst(skb)->dev;
3748 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* Drop @skb with ICMPV6_ADM_PROHIBITED, accounted as
 * IPSTATS_MIB_INNOROUTES.
 */
3751 static int ip6_pkt_prohibit(struct sk_buff *skb)
3753 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* Output-path variant: point skb->dev at the dst's device, then drop
 * @skb with ICMPV6_ADM_PROHIBITED, accounted as IPSTATS_MIB_OUTNOROUTES.
 */
3756 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3758 skb->dev = skb_dst(skb)->dev;
3759 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3763 * Allocate a dst for local (unicast / anycast) address.
/* Create the fib6_info for a local address on @idev's device.
 *
 * The entry is RTN_ANYCAST/RTF_ANYCAST or RTN_LOCAL/RTF_LOCAL (selected
 * with @anycast), uses protocol RTPROT_KERNEL and belongs to the l3mdev
 * table of the device (RT6_TABLE_LOCAL when there is none).
 * fc_ignore_dev_down is set, so fib6_nh_init() does not refuse a device
 * that is down. Returns the result of ip6_route_info_create().
 */
3766 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3767 struct inet6_dev *idev,
3768 const struct in6_addr *addr,
3769 bool anycast, gfp_t gfp_flags)
3771 struct fib6_config cfg = {
3772 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3773 .fc_ifindex = idev->dev->ifindex,
3774 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3777 .fc_protocol = RTPROT_KERNEL,
3778 .fc_nlinfo.nl_net = net,
3779 .fc_ignore_dev_down = true,
3783 cfg.fc_type = RTN_ANYCAST;
3784 cfg.fc_flags |= RTF_ANYCAST;
3786 cfg.fc_type = RTN_LOCAL;
3787 cfg.fc_flags |= RTF_LOCAL;
3790 return ip6_route_info_create(&cfg, gfp_flags, NULL);
3793 /* remove deleted ip from prefsrc entries */
/* Argument bundle handed through fib6_clean_all() to
 * fib6_remove_prefsrc(): the device and the address being removed.
 */
3794 struct arg_dev_net_ip {
3795 struct net_device *dev;
3797 struct in6_addr *addr;
/* fib6_clean_all() callback; @arg is a struct arg_dev_net_ip.
 *
 * If @rt uses the given device (or no device was given), is not the
 * namespace's fib6_null_entry and has the given address as preferred
 * source, the preferred source is cleared (plen = 0) while holding
 * rt6_exception_lock.
 */
3800 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3802 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3803 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3804 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3806 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3807 rt != net->ipv6.fib6_null_entry &&
3808 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3809 spin_lock_bh(&rt6_exception_lock);
3810 /* remove prefsrc entry */
3811 rt->fib6_prefsrc.plen = 0;
3812 spin_unlock_bh(&rt6_exception_lock);
/* Run fib6_remove_prefsrc() over all routes of the namespace that owns
 * the device of @ifp, passing that device in the argument bundle.
 */
3817 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3819 struct net *net = dev_net(ifp->idev->dev);
3820 struct arg_dev_net_ip adni = {
3821 .dev = ifp->idev->dev,
3825 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
/* Flag pair that fib6_clean_tohost() requires to be set together. */
3828 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
/* fib6_clean_all() callback; @arg is the address of the gateway that
 * turned into a host. A route matches when both RTF_RA_ROUTER flags are
 * set, it has a gateway, and that gateway equals the given address.
 * The route's exception table is cleaned with
 * rt6_exceptions_clean_tohost().
 */
3830 /* Remove routers and update dst entries when gateway turn into host. */
3831 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3833 struct in6_addr *gateway = (struct in6_addr *)arg;
3835 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3836 rt->fib6_nh.fib_nh_gw_family &&
3837 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3841 /* Further clean up cached routes in exception table.
3842 * This is needed because cached route may have a different
3843 * gateway than its 'parent' in the case of an ip redirect.
3845 rt6_exceptions_clean_tohost(rt, gateway);
/* Run fib6_clean_tohost() over all routes of @net for @gateway. */
3850 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3852 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for the fib6_ifup() / fib6_ifdown() walkers: the device
 * concerned, the nexthop flags to change and the netdev event.
 */
3855 struct arg_netdev_event {
3856 const struct net_device *dev;
3858 unsigned int nh_flags;
3859 unsigned long event;
/* Find the first route in the leaf list of @rt's node that has the same
 * metric as @rt and qualifies for ECMP. The list is walked with
 * rcu_dereference_protected(), which asserts that the table's tb6_lock
 * is held.
 */
3863 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3865 struct fib6_info *iter;
3866 struct fib6_node *fn;
3868 fn = rcu_dereference_protected(rt->fib6_node,
3869 lockdep_is_held(&rt->fib6_table->tb6_lock));
3870 iter = rcu_dereference_protected(fn->leaf,
3871 lockdep_is_held(&rt->fib6_table->tb6_lock));
3873 if (iter->fib6_metric == rt->fib6_metric &&
3874 rt6_qualify_for_ecmp(iter))
3876 iter = rcu_dereference_protected(iter->fib6_next,
3877 lockdep_is_held(&rt->fib6_table->tb6_lock));
/* A nexthop counts as dead when RTNH_F_DEAD is set, or when
 * RTNH_F_LINKDOWN is set and ip6_ignore_linkdown() holds for its device.
 */
3883 static bool rt6_is_dead(const struct fib6_info *rt)
3885 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3886 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3887 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
/* Sum the nexthop weights of @rt and of all its siblings, leaving out
 * the nexthops that rt6_is_dead() reports as dead.
 */
3893 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3895 struct fib6_info *iter;
3898 if (!rt6_is_dead(rt))
3899 total += rt->fib6_nh.fib_nh_weight;
3901 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3902 if (!rt6_is_dead(iter))
3903 total += iter->fib6_nh.fib_nh_weight;
/* Set the multipath upper bound of @rt's nexthop.
 *
 * A dead nexthop gets -1. Otherwise the nexthop weight is added to the
 * running sum *@weight and the bound is derived from that sum shifted
 * left by 31 bits, using DIV_ROUND_CLOSEST_ULL(). The value is stored
 * atomically in fib_nh_upper_bound.
 */
3909 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3911 int upper_bound = -1;
3913 if (!rt6_is_dead(rt)) {
3914 *weight += rt->fib6_nh.fib_nh_weight;
3915 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3918 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
/* Assign upper bounds to @rt and then to each of its siblings in list
 * order, sharing one running weight across the calls.
 */
3921 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3923 struct fib6_info *iter;
3926 rt6_upper_bound_set(rt, &weight, total);
3928 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3929 rt6_upper_bound_set(iter, &weight, total);
/* Recompute the upper bounds of the multipath route that @rt belongs to.
 *
 * Routes without siblings or marked should_flush are tested for first.
 * The computation starts from the first sibling (a missing first sibling
 * triggers WARN_ON_ONCE): the total weight is summed and the bounds are
 * assigned from there.
 */
3932 void rt6_multipath_rebalance(struct fib6_info *rt)
3934 struct fib6_info *first;
3937 /* In case the entire multipath route was marked for flushing,
3938 * then there is no need to rebalance upon the removal of every
3941 if (!rt->fib6_nsiblings || rt->should_flush)
3944 /* During lookup routes are evaluated in order, so we need to
3945 * make sure upper bounds are assigned from the first sibling
3948 first = rt6_multipath_first_sibling(rt);
3949 if (WARN_ON_ONCE(!first))
3952 total = rt6_multipath_total_weight(first);
3953 rt6_multipath_upper_bound_set(first, total);
/* fib6_clean_all() callback; @p_arg is a struct arg_netdev_event.
 *
 * For a route on arg->dev (other than fib6_null_entry) the nexthop flags
 * in arg->nh_flags are cleared, the serial numbers up to the root are
 * updated and the multipath route is rebalanced.
 */
3956 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3958 const struct arg_netdev_event *arg = p_arg;
3959 struct net *net = dev_net(arg->dev);
3961 if (rt != net->ipv6.fib6_null_entry &&
3962 rt->fib6_nh.fib_nh_dev == arg->dev) {
3963 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3964 fib6_update_sernum_upto_root(net, rt);
3965 rt6_multipath_rebalance(rt);
/* Clear @nh_flags on every route using @dev by running fib6_ifup() over
 * the device's namespace. When RTNH_F_DEAD is being cleared and the
 * carrier is up, RTNH_F_LINKDOWN is added to the flags to clear.
 */
3971 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3973 struct arg_netdev_event arg = {
3976 .nh_flags = nh_flags,
3980 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3981 arg.nh_flags |= RTNH_F_LINKDOWN;
3983 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
/* Check whether @rt or any of its siblings has @dev as nexthop device. */
3986 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3987 const struct net_device *dev)
3989 struct fib6_info *iter;
3991 if (rt->fib6_nh.fib_nh_dev == dev)
3993 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3994 if (iter->fib6_nh.fib_nh_dev == dev)
/* Mark @rt and every one of its siblings with should_flush. */
4000 static void rt6_multipath_flush(struct fib6_info *rt)
4002 struct fib6_info *iter;
4004 rt->should_flush = 1;
4005 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4006 iter->should_flush = 1;
/* Count, over @rt and its siblings, the nexthops that use @down_dev or
 * already carry RTNH_F_DEAD.
 */
4009 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4010 const struct net_device *down_dev)
4012 struct fib6_info *iter;
4013 unsigned int dead = 0;
4015 if (rt->fib6_nh.fib_nh_dev == down_dev ||
4016 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4018 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4019 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4020 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
/* Set @nh_flags on the nexthop of @rt and of each sibling whose nexthop
 * device is @dev.
 */
4026 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4027 const struct net_device *dev,
4028 unsigned int nh_flags)
4030 struct fib6_info *iter;
4032 if (rt->fib6_nh.fib_nh_dev == dev)
4033 rt->fib6_nh.fib_nh_flags |= nh_flags;
4034 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4035 if (iter->fib6_nh.fib_nh_dev == dev)
4036 iter->fib6_nh.fib_nh_flags |= nh_flags;
/* FIB walker callback for a device going away or changing state;
 * @p_arg is a struct arg_netdev_event and arg->event selects the action.
 *
 * fib6_null_entry is tested for first. For NETDEV_UNREGISTER the result
 * is -1 for a route on the device and 0 otherwise. The other branches
 * cover routes already marked should_flush, routes without siblings
 * (again -1 or 0 depending on the device), multipath routes using the
 * device (all hops are flagged for flushing when every nexthop counts as
 * dead, otherwise the hops on the device are flagged and the route is
 * rebalanced), and setting RTNH_F_LINKDOWN on routes on the device that
 * are neither RTF_LOCAL nor RTF_ANYCAST.
 */
4039 /* called with write lock held for table with rt */
4040 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4042 const struct arg_netdev_event *arg = p_arg;
4043 const struct net_device *dev = arg->dev;
4044 struct net *net = dev_net(dev);
4046 if (rt == net->ipv6.fib6_null_entry)
4049 switch (arg->event) {
4050 case NETDEV_UNREGISTER:
4051 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4053 if (rt->should_flush)
4055 if (!rt->fib6_nsiblings)
4056 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4057 if (rt6_multipath_uses_dev(rt, dev)) {
4060 count = rt6_multipath_dead_count(rt, dev);
4061 if (rt->fib6_nsiblings + 1 == count) {
4062 rt6_multipath_flush(rt);
4065 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4067 fib6_update_sernum(net, rt);
4068 rt6_multipath_rebalance(rt);
4072 if (rt->fib6_nh.fib_nh_dev != dev ||
4073 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4075 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4076 rt6_multipath_rebalance(rt);
/* Run fib6_ifdown() over all routes of @dev's namespace for @event.
 * The skip_notify_on_dev_down sysctl selects the walker variant that
 * skips notifications.
 */
4083 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4085 struct arg_netdev_event arg = {
4091 struct net *net = dev_net(dev);
4093 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4094 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4096 fib6_clean_all(net, fib6_ifdown, &arg);
/* Tear down IPv6 routing state for @dev: sync the FIB for @event, flush
 * the device from the uncached route list and take down its neighbour
 * entries in nd_tbl.
 */
4099 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4101 rt6_sync_down_dev(dev, event);
4102 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4103 neigh_ifdown(&nd_tbl, dev);
/* Argument bundle for rt6_mtu_change_route(); that callback reads the
 * device through ->dev and the new MTU through ->mtu.
 */
4106 struct rt6_mtu_change_arg {
4107 struct net_device *dev;
/* fib6_clean_all() callback applied when the MTU of arg->dev changes;
 * @p_arg is a struct rt6_mtu_change_arg.
 *
 * For a route on arg->dev whose RTAX_MTU metric is not locked, the metric
 * is set to arg->mtu when the route's current PMTU is at least arg->mtu,
 * or when it is smaller and equals idev->cnf.mtu6. The route's exception
 * entries are then updated under rt6_exception_lock.
 */
4111 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4113 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4114 struct inet6_dev *idev;
4116 /* In IPv6 pmtu discovery is not optional,
4117 so that RTAX_MTU lock cannot disable it.
4118 We still use this lock to block changes
4119 caused by addrconf/ndisc.
4122 idev = __in6_dev_get(arg->dev);
4126 /* For administrative MTU increase, there is no way to discover
4127 IPv6 PMTU increase, so PMTU increase should be updated here.
4128 Since RFC 1981 doesn't include administrative MTU increase
4129 update PMTU increase is a MUST. (i.e. jumbo frame)
4131 if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4132 !fib6_metric_locked(rt, RTAX_MTU)) {
4133 u32 mtu = rt->fib6_pmtu;
4135 if (mtu >= arg->mtu ||
4136 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4137 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4139 spin_lock_bh(&rt6_exception_lock);
4140 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4141 spin_unlock_bh(&rt6_exception_lock);
/* Run rt6_mtu_change_route() over all routes of @dev's namespace. */
4146 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4148 struct rt6_mtu_change_arg arg = {
4153 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute policy for IPv6 route messages, indexed by RTA_*
 * attribute type; rtm_to_fib6_config() passes it to nlmsg_parse().
 */
4156 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4157 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4158 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4159 [RTA_OIF] = { .type = NLA_U32 },
4160 [RTA_IIF] = { .type = NLA_U32 },
4161 [RTA_PRIORITY] = { .type = NLA_U32 },
4162 [RTA_METRICS] = { .type = NLA_NESTED },
4163 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4164 [RTA_PREF] = { .type = NLA_U8 },
4165 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4166 [RTA_ENCAP] = { .type = NLA_NESTED },
4167 [RTA_EXPIRES] = { .type = NLA_U32 },
4168 [RTA_UID] = { .type = NLA_U32 },
4169 [RTA_MARK] = { .type = NLA_U32 },
4170 [RTA_TABLE] = { .type = NLA_U32 },
4171 [RTA_IP_PROTO] = { .type = NLA_U8 },
4172 [RTA_SPORT] = { .type = NLA_U16 },
4173 [RTA_DPORT] = { .type = NLA_U16 },
/* Convert the netlink route message @nlh into the route configuration
 * @cfg.
 *
 * The message is parsed against rtm_ipv6_policy and @cfg is overwritten
 * from the rtmsg header. Route types UNREACHABLE, BLACKHOLE, PROHIBIT and
 * THROW set RTF_REJECT, RTN_LOCAL sets RTF_LOCAL, RTM_F_CLONED sets
 * RTF_CACHE and RTNH_F_ONLINK is carried over. The attributes handled
 * include gateway (sets RTF_GATEWAY), destination and source prefixes
 * (the attribute must hold at least the bytes implied by the prefix
 * length), preferred source, output interface, priority, metrics, table,
 * multipath and encap data (both validated through lwtunnel helpers),
 * router preference (values other than LOW/HIGH become MEDIUM) and a
 * finite expiry (sets RTF_EXPIRES). An RTA_VIA attribute is rejected via
 * @extack.
 */
4176 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4177 struct fib6_config *cfg,
4178 struct netlink_ext_ack *extack)
4181 struct nlattr *tb[RTA_MAX+1];
4185 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4191 rtm = nlmsg_data(nlh);
4193 *cfg = (struct fib6_config){
4194 .fc_table = rtm->rtm_table,
4195 .fc_dst_len = rtm->rtm_dst_len,
4196 .fc_src_len = rtm->rtm_src_len,
4198 .fc_protocol = rtm->rtm_protocol,
4199 .fc_type = rtm->rtm_type,
4201 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4202 .fc_nlinfo.nlh = nlh,
4203 .fc_nlinfo.nl_net = sock_net(skb->sk),
4206 if (rtm->rtm_type == RTN_UNREACHABLE ||
4207 rtm->rtm_type == RTN_BLACKHOLE ||
4208 rtm->rtm_type == RTN_PROHIBIT ||
4209 rtm->rtm_type == RTN_THROW)
4210 cfg->fc_flags |= RTF_REJECT;
4212 if (rtm->rtm_type == RTN_LOCAL)
4213 cfg->fc_flags |= RTF_LOCAL;
4215 if (rtm->rtm_flags & RTM_F_CLONED)
4216 cfg->fc_flags |= RTF_CACHE;
4218 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4220 if (tb[RTA_GATEWAY]) {
4221 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4222 cfg->fc_flags |= RTF_GATEWAY;
4225 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4230 int plen = (rtm->rtm_dst_len + 7) >> 3;
4232 if (nla_len(tb[RTA_DST]) < plen)
4235 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4239 int plen = (rtm->rtm_src_len + 7) >> 3;
4241 if (nla_len(tb[RTA_SRC]) < plen)
4244 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4247 if (tb[RTA_PREFSRC])
4248 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4251 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4253 if (tb[RTA_PRIORITY])
4254 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4256 if (tb[RTA_METRICS]) {
4257 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4258 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4262 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4264 if (tb[RTA_MULTIPATH]) {
4265 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4266 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4268 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4269 cfg->fc_mp_len, extack);
4275 pref = nla_get_u8(tb[RTA_PREF]);
4276 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4277 pref != ICMPV6_ROUTER_PREF_HIGH)
4278 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4279 cfg->fc_flags |= RTF_PREF(pref);
4283 cfg->fc_encap = tb[RTA_ENCAP];
4285 if (tb[RTA_ENCAP_TYPE]) {
4286 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4288 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4293 if (tb[RTA_EXPIRES]) {
4294 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4296 if (addrconf_finite_timeout(timeout)) {
4297 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4298 cfg->fc_flags |= RTF_EXPIRES;
/* Fields of the per-nexthop record that the multipath add path queues on
 * its rt6_nh_list (the enclosing struct header line is not part of this
 * listing).
 */
4308 struct fib6_info *fib6_info; /* route entry handled for this nexthop */
4309 struct fib6_config r_cfg; /* copy of the config passed when queueing */
4310 struct list_head next; /* linkage on the nexthop list */
/* Queue a nexthop for a multipath add.
 *
 * Walks @rt6_nh_list and tests each queued entry against @rt with
 * rt6_duplicate_nexthop(), then allocates a zeroed rt6_nh with
 * kzalloc(GFP_KERNEL), stores a copy of @r_cfg in it and links it at the
 * tail of the list.  @net is not referenced by any line shown here.
 *
 * NOTE(review): the return statements and the assignment of @rt into the
 * new entry are missing from this listing; presumably a duplicate and an
 * allocation failure each return a negative errno - confirm against the
 * complete file.
 */
4313 static int ip6_route_info_append(struct net *net,
4314 struct list_head *rt6_nh_list,
4315 struct fib6_info *rt,
4316 struct fib6_config *r_cfg)
4321 list_for_each_entry(nh, rt6_nh_list, next) {
4322 /* check if fib6_info already exists */
4323 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4327 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4331 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4332 list_add_tail(&nh->next, rt6_nh_list);
/* Emit the RTM_NEWROUTE notification for a multipath insert.
 *
 * When NLM_F_APPEND is set in nlflags and @rt_last has siblings, @rt is
 * replaced by the first entry on @rt_last's sibling list before
 * inet6_rt_notify() is called with @info and nlflags.
 */
4337 static void ip6_route_mpath_notify(struct fib6_info *rt,
4338 struct fib6_info *rt_last,
4339 struct nl_info *info,
4342 /* if this is an APPEND route, then rt points to the first route
4343 * inserted and rt_last points to last route inserted. Userspace
4344 * wants a consistent dump of the route which starts at the first
4345 * nexthop. Since sibling routes are always added at the end of
4346 * the list, find the first sibling of the last route appended
4348 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4349 rt = list_first_entry(&rt_last->fib6_siblings,
4355 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* Add (or replace) a multipath route described by cfg->fc_mp.
 *
 * Phase 1: for every rtnexthop in the multipath payload, copy @cfg into
 * r_cfg, override the ifindex, gateway, encap and ONLINK flag from the
 * nexthop, build a fib6_info with ip6_route_info_create(), test it with
 * rt6_qualify_for_ecmp(), set its weight to rtnh_hops + 1 and queue it
 * with ip6_route_info_append().
 *
 * Phase 2: with info->skip_notify set, insert each queued route through
 * __ip6_ins_rt(), remembering the first and last route for the single
 * notification sent via ip6_route_mpath_notify().
 *
 * The failure path announces the routes that were added and then removes
 * them again with ip6_route_del(); the final loop releases and unlinks
 * every queued entry.
 *
 * NOTE(review): no length check on the nested RTA_GATEWAY attribute is
 * visible in this listing before nla_get_in6_addr() reads it - confirm
 * against the complete file.
 */
4358 static int ip6_route_multipath_add(struct fib6_config *cfg,
4359 struct netlink_ext_ack *extack)
4361 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4362 struct nl_info *info = &cfg->fc_nlinfo;
4363 struct fib6_config r_cfg;
4364 struct rtnexthop *rtnh;
4365 struct fib6_info *rt;
4366 struct rt6_nh *err_nh;
4367 struct rt6_nh *nh, *nh_safe;
4373 int replace = (cfg->fc_nlinfo.nlh &&
4374 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4375 LIST_HEAD(rt6_nh_list);
4377 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4378 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4379 nlflags |= NLM_F_APPEND;
4381 remaining = cfg->fc_mp_len;
4382 rtnh = (struct rtnexthop *)cfg->fc_mp;
4384 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4385 * fib6_info structs per nexthop
4387 while (rtnh_ok(rtnh, remaining)) {
4388 memcpy(&r_cfg, cfg, sizeof(*cfg));
4389 if (rtnh->rtnh_ifindex)
4390 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4392 attrlen = rtnh_attrlen(rtnh);
4394 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4396 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4398 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4399 r_cfg.fc_flags |= RTF_GATEWAY;
4401 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4402 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4404 r_cfg.fc_encap_type = nla_get_u16(nla);
4407 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4408 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4414 if (!rt6_qualify_for_ecmp(rt)) {
4416 NL_SET_ERR_MSG(extack,
4417 "Device only routes can not be added for IPv6 using the multipath API.");
4418 fib6_info_release(rt);
4422 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4424 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4427 fib6_info_release(rt);
4431 rtnh = rtnh_next(rtnh, &remaining);
4434 /* for add and replace send one notification with all nexthops.
4435 * Skip the notification in fib6_add_rt2node and send one with
4436 * the full route when done
4438 info->skip_notify = 1;
4441 list_for_each_entry(nh, &rt6_nh_list, next) {
4442 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4443 fib6_info_release(nh->fib6_info);
4446 /* save reference to last route successfully inserted */
4447 rt_last = nh->fib6_info;
4449 /* save reference to first route for notification */
4451 rt_notif = nh->fib6_info;
4454 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4455 nh->fib6_info = NULL;
4458 NL_SET_ERR_MSG_MOD(extack,
4459 "multipath route replace failed (check consistency of installed routes)");
4464 /* Because each route is added like a single route we remove
4465 * these flags after the first nexthop: if there is a collision,
4466 * we have already failed to add the first nexthop:
4467 * fib6_add_rt2node() has rejected it; when replacing, old
4468 * nexthops have been replaced by first new, the rest should
4471 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4476 /* success ... tell user about new route */
4477 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4481 /* send notification for routes that were added so that
4482 * the delete notifications sent by ip6_route_del are
4486 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4488 /* Delete routes that were already added */
4489 list_for_each_entry(nh, &rt6_nh_list, next) {
4492 ip6_route_del(&nh->r_cfg, extack);
4496 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4498 fib6_info_release(nh->fib6_info);
4499 list_del(&nh->next);
/* Delete the nexthops listed in cfg->fc_mp one at a time.
 *
 * Each rtnexthop yields a private copy of @cfg whose ifindex and gateway
 * are taken from the nexthop, and that copy is passed to ip6_route_del().
 * The gateway is copied with nla_memcpy() using a fixed size of 16 bytes.
 *
 * NOTE(review): the handling of err/last_err and the return statement are
 * not part of this listing; presumably the last failure is returned -
 * confirm against the complete file.
 */
4506 static int ip6_route_multipath_del(struct fib6_config *cfg,
4507 struct netlink_ext_ack *extack)
4509 struct fib6_config r_cfg;
4510 struct rtnexthop *rtnh;
4513 int err = 1, last_err = 0;
4515 remaining = cfg->fc_mp_len;
4516 rtnh = (struct rtnexthop *)cfg->fc_mp;
4518 /* Parse a Multipath Entry */
4519 while (rtnh_ok(rtnh, remaining)) {
4520 memcpy(&r_cfg, cfg, sizeof(*cfg));
4521 if (rtnh->rtnh_ifindex)
4522 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4524 attrlen = rtnh_attrlen(rtnh);
4526 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4528 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4530 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4531 r_cfg.fc_flags |= RTF_GATEWAY;
4534 err = ip6_route_del(&r_cfg, extack);
4538 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE handler: convert the netlink request to a fib6_config with
 * rtm_to_fib6_config() and delete either through
 * ip6_route_multipath_del() or, with fc_delete_all_nh set, through
 * ip6_route_del().  The test that selects between the two paths is not
 * part of this listing (presumably whether a multipath attribute was
 * supplied - confirm).
 */
4544 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4545 struct netlink_ext_ack *extack)
4547 struct fib6_config cfg;
4550 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4555 return ip6_route_multipath_del(&cfg, extack);
4557 cfg.fc_delete_all_nh = 1;
4558 return ip6_route_del(&cfg, extack);
/* RTM_NEWROUTE handler: convert the request with rtm_to_fib6_config(),
 * default a zero metric to IP6_RT_PRIO_USER, and add the route through
 * ip6_route_multipath_add() or ip6_route_add() with GFP_KERNEL.  The test
 * that selects between the two is not part of this listing.
 */
4562 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4563 struct netlink_ext_ack *extack)
4565 struct fib6_config cfg;
4568 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4572 if (cfg.fc_metric == 0)
4573 cfg.fc_metric = IP6_RT_PRIO_USER;
4576 return ip6_route_multipath_add(&cfg, extack);
4578 return ip6_route_add(&cfg, GFP_KERNEL, extack);
/* Estimate the netlink message size needed to describe @rt.
 *
 * The fixed part sums the aligned rtmsg header and the attributes named
 * in the inline comments.  For a route with siblings, a per-nexthop cost
 * (RTA_MULTIPATH nest, rtnexthop, gateway and encap size) is computed and
 * multiplied by fib6_nsiblings.
 *
 * NOTE(review): the per-nexthop encap size is taken from @rt's own
 * nexthop only, not from each sibling - confirm that is intended.  The
 * line that adds nexthop_len to the total is missing from this listing.
 */
4581 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4583 int nexthop_len = 0;
4585 if (rt->fib6_nsiblings) {
4586 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4587 + NLA_ALIGN(sizeof(struct rtnexthop))
4588 + nla_total_size(16) /* RTA_GATEWAY */
4589 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4591 nexthop_len *= rt->fib6_nsiblings;
4594 return NLMSG_ALIGN(sizeof(struct rtmsg))
4595 + nla_total_size(16) /* RTA_SRC */
4596 + nla_total_size(16) /* RTA_DST */
4597 + nla_total_size(16) /* RTA_GATEWAY */
4598 + nla_total_size(16) /* RTA_PREFSRC */
4599 + nla_total_size(4) /* RTA_TABLE */
4600 + nla_total_size(4) /* RTA_IIF */
4601 + nla_total_size(4) /* RTA_OIF */
4602 + nla_total_size(4) /* RTA_PRIORITY */
4603 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4604 + nla_total_size(sizeof(struct rta_cacheinfo))
4605 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4606 + nla_total_size(1) /* RTA_PREF */
4607 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
/* Fill @skb with a route message of type @type describing @rt and, when
 * given, the @dst derived from it.
 *
 * Destination/source keys and flags are read either from the rt6_info
 * behind @dst or from @rt itself (the selecting test is not part of this
 * listing).  The message carries RTA_TABLE, RTA_DST/RTA_SRC (with a
 * prefix length of 128 when @dest/@src are put), RTA_IIF or a preferred
 * source address, metrics, RTA_PRIORITY, nexthop information
 * (RTA_MULTIPATH when @rt has siblings), cache info and RTA_PREF.
 *
 * Table ids of 256 and above are reported as RT_TABLE_COMPAT in the
 * header; the real id always goes into RTA_TABLE.
 *
 * A failed put jumps to nla_put_failure; the tail of the function calls
 * nlmsg_end() for a complete message and nlmsg_cancel() otherwise.
 */
4611 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4612 struct fib6_info *rt, struct dst_entry *dst,
4613 struct in6_addr *dest, struct in6_addr *src,
4614 int iif, int type, u32 portid, u32 seq,
4617 struct rt6_info *rt6 = (struct rt6_info *)dst;
4618 struct rt6key *rt6_dst, *rt6_src;
4619 u32 *pmetrics, table, rt6_flags;
4620 struct nlmsghdr *nlh;
4624 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4629 rt6_dst = &rt6->rt6i_dst;
4630 rt6_src = &rt6->rt6i_src;
4631 rt6_flags = rt6->rt6i_flags;
4633 rt6_dst = &rt->fib6_dst;
4634 rt6_src = &rt->fib6_src;
4635 rt6_flags = rt->fib6_flags;
4638 rtm = nlmsg_data(nlh);
4639 rtm->rtm_family = AF_INET6;
4640 rtm->rtm_dst_len = rt6_dst->plen;
4641 rtm->rtm_src_len = rt6_src->plen;
4644 table = rt->fib6_table->tb6_id;
4646 table = RT6_TABLE_UNSPEC;
4647 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4648 if (nla_put_u32(skb, RTA_TABLE, table))
4649 goto nla_put_failure;
4651 rtm->rtm_type = rt->fib6_type;
4653 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4654 rtm->rtm_protocol = rt->fib6_protocol;
4656 if (rt6_flags & RTF_CACHE)
4657 rtm->rtm_flags |= RTM_F_CLONED;
4660 if (nla_put_in6_addr(skb, RTA_DST, dest))
4661 goto nla_put_failure;
4662 rtm->rtm_dst_len = 128;
4663 } else if (rtm->rtm_dst_len)
4664 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4665 goto nla_put_failure;
4666 #ifdef CONFIG_IPV6_SUBTREES
4668 if (nla_put_in6_addr(skb, RTA_SRC, src))
4669 goto nla_put_failure;
4670 rtm->rtm_src_len = 128;
4671 } else if (rtm->rtm_src_len &&
4672 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4673 goto nla_put_failure;
4676 #ifdef CONFIG_IPV6_MROUTE
4677 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4678 int err = ip6mr_get_route(net, skb, rtm, portid);
4683 goto nla_put_failure;
4686 if (nla_put_u32(skb, RTA_IIF, iif))
4687 goto nla_put_failure;
4689 struct in6_addr saddr_buf;
4690 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4691 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4692 goto nla_put_failure;
4695 if (rt->fib6_prefsrc.plen) {
4696 struct in6_addr saddr_buf;
4697 saddr_buf = rt->fib6_prefsrc.addr;
4698 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4699 goto nla_put_failure;
4702 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4703 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4704 goto nla_put_failure;
4706 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4707 goto nla_put_failure;
4709 /* For multipath routes, walk the siblings list and add
4710 * each as a nexthop within RTA_MULTIPATH.
4713 if (rt6_flags & RTF_GATEWAY &&
4714 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4715 goto nla_put_failure;
4717 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4718 goto nla_put_failure;
4719 } else if (rt->fib6_nsiblings) {
4720 struct fib6_info *sibling, *next_sibling;
4723 mp = nla_nest_start(skb, RTA_MULTIPATH);
4725 goto nla_put_failure;
4727 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4728 rt->fib6_nh.fib_nh_weight) < 0)
4729 goto nla_put_failure;
4731 list_for_each_entry_safe(sibling, next_sibling,
4732 &rt->fib6_siblings, fib6_siblings) {
4733 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4734 sibling->fib6_nh.fib_nh_weight) < 0)
4735 goto nla_put_failure;
4738 nla_nest_end(skb, mp);
4740 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4741 &rtm->rtm_flags, false) < 0)
4742 goto nla_put_failure;
4745 if (rt6_flags & RTF_EXPIRES) {
4746 expires = dst ? dst->expires : rt->expires;
4750 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4751 goto nla_put_failure;
4753 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4754 goto nla_put_failure;
4757 nlmsg_end(skb, nlh);
4761 nlmsg_cancel(skb, nlh);
/* Check whether @dev is the nexthop device of @f6i or of any route on its
 * sibling list.
 *
 * NOTE(review): the return statements are missing from this listing;
 * presumably true on a match and false otherwise.
 */
4765 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4766 const struct net_device *dev)
4768 if (f6i->fib6_nh.fib_nh_dev == dev)
4771 if (f6i->fib6_nsiblings) {
4772 struct fib6_info *sibling, *next_sibling;
4774 list_for_each_entry_safe(sibling, next_sibling,
4775 &f6i->fib6_siblings, fib6_siblings) {
4776 if (sibling->fib6_nh.fib_nh_dev == dev)
/* Dump callback: write @rt into the dump skb carried in @p_arg (a
 * struct rt6_rtnl_dump_arg).
 *
 * The namespace's fib6_null_entry is special-cased, as are routes without
 * RTF_PREFIX_RT when the filter asks for RTM_F_PREFIX.  With filter_set,
 * routes whose type, device or protocol do not match the filter are
 * special-cased as well and NLM_F_DUMP_FILTERED is added to the message
 * flags.  Everything else is emitted as RTM_NEWROUTE with NLM_F_MULTI via
 * rt6_fill_node().
 *
 * NOTE(review): the return values of the special cases are not part of
 * this listing (the surviving comment calls the prefix case "success").
 */
4784 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4786 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4787 struct fib_dump_filter *filter = &arg->filter;
4788 unsigned int flags = NLM_F_MULTI;
4789 struct net *net = arg->net;
4791 if (rt == net->ipv6.fib6_null_entry)
4794 if ((filter->flags & RTM_F_PREFIX) &&
4795 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4796 /* success since this is not a prefix route */
4799 if (filter->filter_set) {
4800 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4801 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4802 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4805 flags |= NLM_F_DUMP_FILTERED;
4808 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4809 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4810 arg->cb->nlh->nlmsg_seq, flags);
/* Validate an RTM_GETROUTE request and parse its attributes into tb[].
 *
 * A message shorter than the rtmsg header is flagged as invalid.  When
 * netlink_strict_get_check() is false the request is simply parsed with
 * nlmsg_parse().  Otherwise the header checks visible here require prefix
 * lengths of 0 or 128, no table/protocol/scope, and no flag other than
 * RTM_F_FIB_MATCH; attributes are then parsed with nlmsg_parse_strict(),
 * RTA_SRC/RTA_DST must come with a non-zero prefix length, and the final
 * loop over tb[] reports unsupported attributes.
 *
 * NOTE(review): the error return statements are missing from this
 * listing; each extack message presumably precedes a negative errno.
 */
4813 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4814 const struct nlmsghdr *nlh,
4816 struct netlink_ext_ack *extack)
4821 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4822 NL_SET_ERR_MSG_MOD(extack,
4823 "Invalid header for get route request");
4827 if (!netlink_strict_get_check(skb))
4828 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4829 rtm_ipv6_policy, extack);
4831 rtm = nlmsg_data(nlh);
4832 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4833 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4834 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4836 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4839 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4840 NL_SET_ERR_MSG_MOD(extack,
4841 "Invalid flags for get route request");
4845 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4846 rtm_ipv6_policy, extack);
4850 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4851 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4852 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4856 for (i = 0; i <= RTA_MAX; i++) {
4872 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
/* RTM_GETROUTE handler: resolve a single route and unicast the answer.
 *
 * After inet6_rtm_valid_getroute_req() the flow key fl6 is filled from
 * the request: flow label from rtm_tos, addresses, mark, uid, ports and
 * IP protocol.  Without RTA_UID the uid is INVALID_UID when an input
 * interface was given and current_uid() otherwise.
 *
 * A lookup with an input interface goes through
 * ip6_route_input_lookup() on the device found by
 * dev_get_by_index_rcu(); otherwise ip6_route_output() is used with
 * fl6.flowi6_oif.  A dst carrying an error, or the namespace's
 * ip6_null_entry, yields rt->dst.error.
 *
 * The reply skb takes the dst via skb_dst_set() and is filled by
 * rt6_fill_node() - either from the originating fib entry alone or
 * together with the dst and the flow addresses - then sent with
 * rtnl_unicast().
 *
 * NOTE(review): which rt6_fill_node() form serves RTM_F_FIB_MATCH is
 * decided on a line missing from this listing; presumably the fib-only
 * form.
 */
4880 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4881 struct netlink_ext_ack *extack)
4883 struct net *net = sock_net(in_skb->sk);
4884 struct nlattr *tb[RTA_MAX+1];
4885 int err, iif = 0, oif = 0;
4886 struct fib6_info *from;
4887 struct dst_entry *dst;
4888 struct rt6_info *rt;
4889 struct sk_buff *skb;
4891 struct flowi6 fl6 = {};
4894 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4899 rtm = nlmsg_data(nlh);
4900 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4901 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4904 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4907 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4911 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4914 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4918 iif = nla_get_u32(tb[RTA_IIF]);
4921 oif = nla_get_u32(tb[RTA_OIF]);
4924 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4927 fl6.flowi6_uid = make_kuid(current_user_ns(),
4928 nla_get_u32(tb[RTA_UID]));
4930 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4933 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4936 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4938 if (tb[RTA_IP_PROTO]) {
4939 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4940 &fl6.flowi6_proto, AF_INET6,
4947 struct net_device *dev;
4952 dev = dev_get_by_index_rcu(net, iif);
4959 fl6.flowi6_iif = iif;
4961 if (!ipv6_addr_any(&fl6.saddr))
4962 flags |= RT6_LOOKUP_F_HAS_SADDR;
4964 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4968 fl6.flowi6_oif = oif;
4970 dst = ip6_route_output(net, NULL, &fl6);
4974 rt = container_of(dst, struct rt6_info, dst);
4975 if (rt->dst.error) {
4976 err = rt->dst.error;
4981 if (rt == net->ipv6.ip6_null_entry) {
4982 err = rt->dst.error;
4987 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4994 skb_dst_set(skb, &rt->dst);
4997 from = rcu_dereference(rt->from);
5000 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
5001 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5004 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5005 &fl6.saddr, iif, RTM_NEWROUTE,
5006 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5015 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* Send a route @event for @rt to RTNLGRP_IPV6_ROUTE listeners.
 *
 * Allocates an skb of rt6_nlmsg_size(rt) bytes with nlmsg_new(), fills it
 * with rt6_fill_node() using the sequence number of info->nlh (0 when
 * there is none) and hands it to rtnl_notify().  The last visible
 * statement reports an error to the group with rtnl_set_sk_err().
 * -EMSGSIZE from rt6_fill_node() triggers WARN_ON because it means the
 * size estimate was too small.
 */
5020 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5021 unsigned int nlm_flags)
5023 struct sk_buff *skb;
5024 struct net *net = info->nl_net;
5029 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5031 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5035 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5036 event, info->portid, seq, nlm_flags);
5038 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5039 WARN_ON(err == -EMSGSIZE);
5043 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5044 info->nlh, gfp_any());
5048 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdevice notifier that ties the per-namespace special route entries to
 * the loopback device.
 *
 * Devices without IFF_LOOPBACK are filtered out first.  On
 * NETDEV_REGISTER the null (and, with CONFIG_IPV6_MULTIPLE_TABLES,
 * prohibit and blackhole) entries get @dev and an inet6_dev reference
 * from in6_dev_get().  On NETDEV_UNREGISTER, while the device is not yet
 * NETREG_UNREGISTERED, those references are dropped with
 * in6_dev_put_clear().
 */
5051 static int ip6_route_dev_notify(struct notifier_block *this,
5052 unsigned long event, void *ptr)
5054 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5055 struct net *net = dev_net(dev);
5057 if (!(dev->flags & IFF_LOOPBACK))
5060 if (event == NETDEV_REGISTER) {
5061 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5062 net->ipv6.ip6_null_entry->dst.dev = dev;
5063 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5064 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5065 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5066 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5067 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5068 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5070 } else if (event == NETDEV_UNREGISTER &&
5071 dev->reg_state != NETREG_UNREGISTERED) {
5072 /* NETDEV_UNREGISTER could be fired for multiple times by
5073 * netdev_wait_allrefs(). Make sure we only call this once.
5075 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5076 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5077 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5078 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5089 #ifdef CONFIG_PROC_FS
/* seq_file show routine for the routing statistics.
 *
 * Prints seven space-separated hexadecimal fields ("%04x") for the
 * namespace stored in seq->private: six rt6_stats counters plus, in the
 * sixth position, the dst entry count from dst_entries_get_slow().
 */
5090 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5092 struct net *net = (struct net *)seq->private;
5093 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5094 net->ipv6.rt6_stats->fib_nodes,
5095 net->ipv6.rt6_stats->fib_route_nodes,
5096 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5097 net->ipv6.rt6_stats->fib_rt_entries,
5098 net->ipv6.rt6_stats->fib_rt_cache,
5099 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5100 net->ipv6.rt6_stats->fib_discarded_routes);
5104 #endif /* CONFIG_PROC_FS */
5106 #ifdef CONFIG_SYSCTL
/* Handler for the "flush" sysctl.
 *
 * Reads the flush_delay of the namespace kept in ctl->extra1 before
 * proc_dointvec() processes the access, then runs fib6_run_gc() with that
 * earlier value: a non-positive delay is passed as 0, and the last
 * argument is true only for a positive delay.
 *
 * NOTE(review): the checks on @write and on the proc_dointvec() result
 * are not part of this listing.
 */
5109 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5110 void __user *buffer, size_t *lenp, loff_t *ppos)
5118 net = (struct net *)ctl->extra1;
5119 delay = net->ipv6.sysctl.flush_delay;
5120 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5124 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-namespace IPv6 route sysctl table.
 *
 * The .data pointers refer to init_net (or ip6_dst_ops_template) here and
 * are rewritten by index in ipv6_route_sysctl_init(), so the order of the
 * entries must stay in sync with that function.
 */
5131 static struct ctl_table ipv6_route_table_template[] = {
5133 .procname = "flush",
5134 .data = &init_net.ipv6.sysctl.flush_delay,
5135 .maxlen = sizeof(int),
5137 .proc_handler = ipv6_sysctl_rtcache_flush
5140 .procname = "gc_thresh",
5141 .data = &ip6_dst_ops_template.gc_thresh,
5142 .maxlen = sizeof(int),
5144 .proc_handler = proc_dointvec,
5147 .procname = "max_size",
5148 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
5149 .maxlen = sizeof(int),
5151 .proc_handler = proc_dointvec,
5154 .procname = "gc_min_interval",
5155 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5156 .maxlen = sizeof(int),
5158 .proc_handler = proc_dointvec_jiffies,
5161 .procname = "gc_timeout",
5162 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5163 .maxlen = sizeof(int),
5165 .proc_handler = proc_dointvec_jiffies,
5168 .procname = "gc_interval",
5169 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5170 .maxlen = sizeof(int),
5172 .proc_handler = proc_dointvec_jiffies,
5175 .procname = "gc_elasticity",
5176 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5177 .maxlen = sizeof(int),
5179 .proc_handler = proc_dointvec,
5182 .procname = "mtu_expires",
5183 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5184 .maxlen = sizeof(int),
5186 .proc_handler = proc_dointvec_jiffies,
5189 .procname = "min_adv_mss",
5190 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5191 .maxlen = sizeof(int),
5193 .proc_handler = proc_dointvec,
/* Same variable as "gc_min_interval", exposed through a different handler. */
5196 .procname = "gc_min_interval_ms",
5197 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5198 .maxlen = sizeof(int),
5200 .proc_handler = proc_dointvec_ms_jiffies,
5203 .procname = "skip_notify_on_dev_down",
5204 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5205 .maxlen = sizeof(int),
5207 .proc_handler = proc_dointvec,
/* Build the route sysctl table for @net.
 *
 * Duplicates ipv6_route_table_template with kmemdup() and points each
 * entry, by index, at the corresponding field of @net.  Entry 0 ("flush")
 * also receives @net in extra1.  For namespaces whose user_ns is not
 * init_user_ns, the procname of entry 0 is set to NULL.
 *
 * NOTE(review): the allocation check and the return statement are not
 * part of this listing.
 */
5214 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5216 struct ctl_table *table;
5218 table = kmemdup(ipv6_route_table_template,
5219 sizeof(ipv6_route_table_template),
5223 table[0].data = &net->ipv6.sysctl.flush_delay;
5224 table[0].extra1 = net;
5225 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5226 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5227 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5228 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5229 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5230 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5231 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5232 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5233 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5234 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5236 /* Don't export sysctls to unprivileged users */
5237 if (net->user_ns != &init_user_ns)
5238 table[0].procname = NULL;
/* Per-namespace setup of the IPv6 routing core.
 *
 * Copies ip6_dst_ops_template into @net, initialises the dst entry
 * counter, duplicates the fib6 null and ip6 null templates (and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and blackhole templates) with
 * kmemdup(), binds each duplicated dst to the namespace's dst ops and
 * template metrics, and loads the default sysctl values.
 *
 * The labels at the end free what earlier steps allocated when a later
 * step fails.
 */
5245 static int __net_init ip6_route_net_init(struct net *net)
5249 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5250 sizeof(net->ipv6.ip6_dst_ops));
5252 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5253 goto out_ip6_dst_ops;
5255 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5256 sizeof(*net->ipv6.fib6_null_entry),
5258 if (!net->ipv6.fib6_null_entry)
5259 goto out_ip6_dst_entries;
5261 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5262 sizeof(*net->ipv6.ip6_null_entry),
5264 if (!net->ipv6.ip6_null_entry)
5265 goto out_fib6_null_entry;
5266 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5267 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5268 ip6_template_metrics, true);
5270 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5271 net->ipv6.fib6_has_custom_rules = false;
5272 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5273 sizeof(*net->ipv6.ip6_prohibit_entry),
5275 if (!net->ipv6.ip6_prohibit_entry)
5276 goto out_ip6_null_entry;
5277 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5278 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5279 ip6_template_metrics, true);
5281 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5282 sizeof(*net->ipv6.ip6_blk_hole_entry),
5284 if (!net->ipv6.ip6_blk_hole_entry)
5285 goto out_ip6_prohibit_entry;
5286 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5287 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5288 ip6_template_metrics, true);
5291 net->ipv6.sysctl.flush_delay = 0;
5292 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5293 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5294 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5295 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5296 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5297 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5298 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5299 net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5301 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5308 out_ip6_prohibit_entry:
5309 kfree(net->ipv6.ip6_prohibit_entry);
5311 kfree(net->ipv6.ip6_null_entry);
5313 out_fib6_null_entry:
5314 kfree(net->ipv6.fib6_null_entry);
5315 out_ip6_dst_entries:
5316 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-namespace teardown: free the special route entries with kfree()
 * (the prohibit and blackhole entries only with
 * CONFIG_IPV6_MULTIPLE_TABLES) and destroy the dst entry counter.
 */
5321 static void __net_exit ip6_route_net_exit(struct net *net)
5323 kfree(net->ipv6.fib6_null_entry);
5324 kfree(net->ipv6.ip6_null_entry);
5325 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5326 kfree(net->ipv6.ip6_prohibit_entry);
5327 kfree(net->ipv6.ip6_blk_hole_entry);
5329 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-namespace init: with CONFIG_PROC_FS, create the "ipv6_route"
 * and "rt6_stats" entries under net->proc_net.  "rt6_stats" is created
 * with mode 0444 and is rendered by rt6_stats_seq_show().
 */
5332 static int __net_init ip6_route_net_init_late(struct net *net)
5334 #ifdef CONFIG_PROC_FS
5335 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5336 sizeof(struct ipv6_route_iter));
5337 proc_create_net_single("rt6_stats", 0444, net->proc_net,
5338 rt6_stats_seq_show, NULL);
/* Late per-namespace exit: with CONFIG_PROC_FS, remove the "ipv6_route"
 * and "rt6_stats" entries from net->proc_net.
 */
5343 static void __net_exit ip6_route_net_exit_late(struct net *net)
5345 #ifdef CONFIG_PROC_FS
5346 remove_proc_entry("ipv6_route", net->proc_net);
5347 remove_proc_entry("rt6_stats", net->proc_net);
/* Per-namespace hooks for the routing core state; registered from
 * ip6_route_init().
 */
5351 static struct pernet_operations ip6_route_net_ops = {
5352 .init = ip6_route_net_init,
5353 .exit = ip6_route_net_exit,
/* Allocate the namespace's inet_peer_base with kmalloc(GFP_KERNEL),
 * initialise it with inet_peer_base_init() and store it in
 * net->ipv6.peers.
 *
 * NOTE(review): the allocation-failure check and the return statements
 * are not part of this listing.
 */
5356 static int __net_init ipv6_inetpeer_init(struct net *net)
5358 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5362 inet_peer_base_init(bp);
5363 net->ipv6.peers = bp;
/* Detach the namespace's inet_peer_base (net->ipv6.peers is set to NULL)
 * and invalidate its tree with inetpeer_invalidate_tree().
 *
 * NOTE(review): the release of the base itself is not visible in this
 * listing - confirm against the complete file.
 */
5367 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5369 struct inet_peer_base *bp = net->ipv6.peers;
5371 net->ipv6.peers = NULL;
5372 inetpeer_invalidate_tree(bp);
/* Per-namespace hooks for the IPv6 inet_peer base; registered from
 * ip6_route_init().
 */
5376 static struct pernet_operations ipv6_inetpeer_ops = {
5377 .init = ipv6_inetpeer_init,
5378 .exit = ipv6_inetpeer_exit,
/* Per-namespace hooks for the late (procfs) setup; registered from
 * ip6_route_init() after fib6_rules_init().
 */
5381 static struct pernet_operations ip6_route_net_late_ops = {
5382 .init = ip6_route_net_init_late,
5383 .exit = ip6_route_net_exit_late,
/* Netdevice notifier block for ip6_route_dev_notify(), with its priority
 * set to ADDRCONF_NOTIFY_PRIORITY - 10.
 */
5386 static struct notifier_block ip6_route_dev_notifier = {
5387 .notifier_call = ip6_route_dev_notify,
5388 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/* Bind init_net's special route entries to its loopback device.
 *
 * Performs for init_net the same assignments that ip6_route_dev_notify()
 * makes on NETDEV_REGISTER: the device pointer and an in6_dev_get()
 * reference for the null entry and, with CONFIG_IPV6_MULTIPLE_TABLES,
 * for the prohibit and blackhole entries.
 */
5391 void __init ip6_route_init_special_entries(void)
5393 /* Registering of the loopback is done before this portion of code,
5394 * the loopback reference in rt6_info will not be taken, do it
5395 * manually for init_net */
5396 init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5397 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5398 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5399 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5400 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5401 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5402 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5403 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* Boot-time initialisation of IPv6 routing.
 *
 * Creates the "ip6_dst_cache" slab for struct rt6_info, initialises the
 * blackhole dst counter, registers the inetpeer and route pernet
 * subsystems, initialises the fib6 rules, registers the late pernet ops,
 * the RTM_NEWROUTE/RTM_DELROUTE/RTM_GETROUTE handlers (GETROUTE with
 * RTNL_FLAG_DOIT_UNLOCKED) and the netdevice notifier, then initialises
 * the list head and lock of every possible CPU's rt6_uncached_list.
 *
 * The labels at the end undo the completed steps when a later step fails.
 * Some init steps and labels are missing from this listing.
 */
5407 int __init ip6_route_init(void)
5413 ip6_dst_ops_template.kmem_cachep =
5414 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5415 SLAB_HWCACHE_ALIGN, NULL);
5416 if (!ip6_dst_ops_template.kmem_cachep)
5419 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5421 goto out_kmem_cache;
5423 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5425 goto out_dst_entries;
5427 ret = register_pernet_subsys(&ip6_route_net_ops);
5429 goto out_register_inetpeer;
5431 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5435 goto out_register_subsys;
5441 ret = fib6_rules_init();
5445 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5447 goto fib6_rules_init;
5449 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5450 inet6_rtm_newroute, NULL, 0);
5452 goto out_register_late_subsys;
5454 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5455 inet6_rtm_delroute, NULL, 0);
5457 goto out_register_late_subsys;
5459 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5460 inet6_rtm_getroute, NULL,
5461 RTNL_FLAG_DOIT_UNLOCKED);
5463 goto out_register_late_subsys;
5465 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5467 goto out_register_late_subsys;
5469 for_each_possible_cpu(cpu) {
5470 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5472 INIT_LIST_HEAD(&ul->head);
5473 spin_lock_init(&ul->lock);
5479 out_register_late_subsys:
5480 rtnl_unregister_all(PF_INET6);
5481 unregister_pernet_subsys(&ip6_route_net_late_ops);
5483 fib6_rules_cleanup();
5488 out_register_subsys:
5489 unregister_pernet_subsys(&ip6_route_net_ops);
5490 out_register_inetpeer:
5491 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5493 dst_entries_destroy(&ip6_dst_blackhole_ops);
5495 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/* Tear down what ip6_route_init() set up: unregister the netdevice
 * notifier and the late pernet ops, clean up the fib6 rules, unregister
 * the inetpeer and route pernet subsystems, destroy the blackhole dst
 * counter and finally the dst slab cache.  Some teardown calls are
 * missing from this listing.
 */
5499 void ip6_route_cleanup(void)
5501 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5502 unregister_pernet_subsys(&ip6_route_net_late_ops);
5503 fib6_rules_cleanup();
5506 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5507 unregister_pernet_subsys(&ip6_route_net_ops);
5508 dst_entries_destroy(&ip6_dst_blackhole_ops);
5509 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);