]> asedeno.scripts.mit.edu Git - linux.git/blob - net/ipv6/route.c
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Neighbour reachability verdicts used during default router selection
 * (see rt6_check_neigh()).  Negative values are failures of increasing
 * severity; callers treat any negative score as "do not prefer".
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* never use this nexthop */
	RT6_NUD_FAIL_PROBE = -2,	/* not reachable; worth probing */
	RT6_NUD_FAIL_DO_RR = -1,	/* unknown; round-robin to the next */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106                            int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109                          struct fib6_info *rt, struct dst_entry *dst,
110                          struct in6_addr *dest, struct in6_addr *src,
111                          int iif, int type, u32 portid, u32 seq,
112                          unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
114                                            struct in6_addr *daddr,
115                                            struct in6_addr *saddr);
116
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev,
122                                            unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124                                            const struct in6_addr *prefix, int prefixlen,
125                                            const struct in6_addr *gwaddr,
126                                            struct net_device *dev);
127 #endif
128
/* Per-cpu list of uncached rt6_info entries (dsts not owned by the FIB
 * tree).  The lock protects @head; entries may be deleted from a cpu
 * other than the one that added them.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135
/* Link @rt onto this cpu's uncached list so it can be found and have its
 * device references repointed when that device goes away (see
 * rt6_uncached_list_flush_dev()).
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	/* remember the owning list: deletion may run on another cpu */
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
146
/* Unlink @rt from the uncached list it was added to, if any, and drop
 * it from the per-netns uncached-route counter.  Safe to call on an
 * entry that was never added (list head initialized empty).
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
159
/* @dev is going away: walk every cpu's uncached list and repoint any
 * rt6_info that still references @dev (device pointer and/or idev) to
 * the netns loopback device, so the uncached dsts never hold a stale
 * device reference.  No-op when @dev is the loopback itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference to loopback's */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* take loopback ref before dropping the old dev ref */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
191
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193                                              struct sk_buff *skb,
194                                              const void *daddr)
195 {
196         if (!ipv6_addr_any(p))
197                 return (const void *) p;
198         else if (skb)
199                 return &ipv6_hdr(skb)->daddr;
200         return daddr;
201 }
202
/* Look up, or create on demand, the neighbour entry for a route's
 * nexthop on @dev.  Returns NULL when the entry cannot be created
 * (callers treat NULL as "no neighbour").
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	/* no cached entry: create one; hide the ERR_PTR from callers */
	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
218
/* dst_ops->neigh_lookup: thin wrapper feeding the rt6_info's gateway
 * into ip6_neigh_lookup().
 */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
227
/* dst_ops->confirm_neigh: mark the nexthop's neighbour entry as
 * confirmed-reachable.  Skips devices that do not do neighbour
 * resolution and multicast destinations, which have no NUD state.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
242
/* Template for the per-netns IPv6 dst_ops; copied into each struct net
 * at namespace init.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
261
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265
266         return mtu ? : dst->dev->mtu;
267 }
268
/* Blackhole dsts deliberately ignore PMTU updates ... */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* ... and redirects: no-op callbacks for ip6_dst_blackhole_ops. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
278
/* dst_ops for blackhole clones (e.g. from ip6_dst_blackhole()):
 * PMTU/redirect events are no-ops, and there is no gc.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};

/* Metrics for the template routes below; all defaults (hoplimit 0
 * means "use the namespace default").
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
294
/* Template for the per-netns fib6_null_entry: the "no route" FIB entry
 * returned by lookups that match nothing.  Metric ~0 sorts it last.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
303
/* Template for the per-netns ip6_null_entry dst: packets routed here
 * are discarded with ICMPv6 "no route" (-ENETUNREACH).
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
315
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Policy-routing "prohibit" dst template: discard with ICMPv6
 * administratively-prohibited (-EACCES).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* Policy-routing "blackhole" dst template: silent discard, no ICMP. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
343
/* Zero everything after the embedded dst_entry (which dst_alloc()
 * already initialized) and set up the uncached-list hook so that
 * rt6_uncached_list_del() is safe even if the entry is never added.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
351
/* Allocate an rt6_info dst with the per-netns ip6_dst_ops, initialize
 * the rt6-specific tail and bump the allocation counter.  Returns NULL
 * on allocation failure.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
367
/* dst_ops->destroy: final teardown of an rt6_info.  Releases the
 * metrics, the uncached-list link, the inet6_dev reference and the
 * reference on the fib6_info this dst was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* clear rt->from under RCU so concurrent readers see either the
	 * old fib6_info or NULL, then drop our reference on it
	 */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
389
/* dst_ops->ifdown: @dev is going down, so repoint this dst's inet6_dev
 * reference to the netns loopback device (if a loopback idev ref can
 * be taken) instead of keeping the dying device pinned.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
406
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409         if (rt->rt6i_flags & RTF_EXPIRES)
410                 return time_after(jiffies, rt->dst.expires);
411         else
412                 return false;
413 }
414
/* Like __rt6_check_expired() but, for cached routes still linked to a
 * fib6_info, also treat the route as expired when the dst is obsolete
 * or the originating fib6_info has expired.
 * Caller holds rcu_read_lock() (rt->from is rcu_dereference'd).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
430
/* ECMP sibling selection: pick the multipath sibling of @match whose
 * hash-threshold range covers the flow hash.  Falls back to @match when
 * its own nexthop covers the hash or no usable sibling is found.
 * Caller holds rcu_read_lock().
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* hash falls in this sibling's range; use it only when the
		 * nexthop does not score as unusable
		 */
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
464
465 /*
466  *      Route lookup. rcu_read_lock() should be held.
467  */
468
469 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
470                                const struct in6_addr *saddr, int oif, int flags)
471 {
472         const struct net_device *dev;
473
474         if (nh->fib_nh_flags & RTNH_F_DEAD)
475                 return false;
476
477         dev = nh->fib_nh_dev;
478         if (oif) {
479                 if (dev->ifindex == oif)
480                         return true;
481         } else {
482                 if (ipv6_chk_addr(net, saddr, dev,
483                                   flags & RT6_LOOKUP_F_IFACE))
484                         return true;
485         }
486
487         return false;
488 }
489
/* Walk the fib6_info chain starting at @rt and return the first entry
 * whose nexthop matches @oif/@saddr (see __rt6_device_match()).  With
 * no constraints, @rt itself is returned unless its nexthop is dead.
 * Returns fib6_null_entry when a strict interface match fails.
 * Caller holds rcu_read_lock().
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	const struct fib6_nh *nh;
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		nh = &sprt->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags))
			return sprt;
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	/* nothing matched: fall back to @rt itself unless it is dead */
	return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
514
515 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router-probe request: holds a device reference until the
 * work item runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;		/* held (dev_hold) by the submitter */
};

/* Workqueue handler: send a Neighbor Solicitation for the target to
 * its solicited-node multicast address, then drop the device reference
 * and free the request.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
533
534 static void rt6_probe(struct fib6_nh *fib6_nh)
535 {
536         struct __rt6_probe_work *work = NULL;
537         const struct in6_addr *nh_gw;
538         struct neighbour *neigh;
539         struct net_device *dev;
540         struct inet6_dev *idev;
541
542         /*
543          * Okay, this does not seem to be appropriate
544          * for now, however, we need to check if it
545          * is really so; aka Router Reachability Probing.
546          *
547          * Router Reachability Probe MUST be rate-limited
548          * to no more than one per minute.
549          */
550         if (fib6_nh->fib_nh_gw_family)
551                 return;
552
553         nh_gw = &fib6_nh->fib_nh_gw6;
554         dev = fib6_nh->fib_nh_dev;
555         rcu_read_lock_bh();
556         idev = __in6_dev_get(dev);
557         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
558         if (neigh) {
559                 if (neigh->nud_state & NUD_VALID)
560                         goto out;
561
562                 write_lock(&neigh->lock);
563                 if (!(neigh->nud_state & NUD_VALID) &&
564                     time_after(jiffies,
565                                neigh->updated + idev->cnf.rtr_probe_interval)) {
566                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
567                         if (work)
568                                 __neigh_set_probe_once(neigh);
569                 }
570                 write_unlock(&neigh->lock);
571         } else if (time_after(jiffies, fib6_nh->last_probe +
572                                        idev->cnf.rtr_probe_interval)) {
573                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
574         }
575
576         if (work) {
577                 fib6_nh->last_probe = jiffies;
578                 INIT_WORK(&work->work, rt6_probe_deferred);
579                 work->target = *nh_gw;
580                 dev_hold(dev);
581                 work->dev = dev;
582                 schedule_work(&work->work);
583         }
584
585 out:
586         rcu_read_unlock_bh();
587 }
588 #else
589 static inline void rt6_probe(struct fib6_nh *fib6_nh)
590 {
591 }
592 #endif
593
594 /*
595  * Default Router Selection (RFC 2461 6.3.6)
596  */
/* Classify the reachability of a nexthop's gateway neighbour for
 * default router selection.  With CONFIG_IPV6_ROUTER_PREF, an unknown
 * or unprobed neighbour still "succeeds" (it will be probed); without
 * it, a missing entry triggers round-robin (RT6_NUD_FAIL_DO_RR).
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
624
/* Score a nexthop for route selection.  Higher is better; negative
 * values are rt6_nud_state failures.  Device match contributes 2,
 * router preference (if configured) is folded in above that, and with
 * RT6_LOOKUP_F_REACHABLE the gateway's NUD state can veto the route.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}
646
/* Evaluate one nexthop against the best score seen so far (*mpri).
 * Returns true (and updates *mpri / *do_rr) when this nexthop becomes
 * the new best match.  Dead or (unignorably) link-down nexthops are
 * skipped; with RT6_LOOKUP_F_REACHABLE a probe may be scheduled.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
682
/* Scan the fib6_info chain from @rt_start up to (not including)
 * @nomatch, keeping the best-scoring non-expired entry in *match.
 * When @cont is non-NULL, stop at the first entry whose metric differs
 * from @metric and record it in *cont for a possible continuation scan.
 * Caller holds rcu_read_lock().
 */
static void __find_rr_leaf(struct fib6_info *rt_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_info **match, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *rt;

	for (rt = rt_start;
	     rt && rt != nomatch;
	     rt = rcu_dereference(rt->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && rt->fib6_metric != metric) {
			*cont = rt;
			return;
		}

		if (fib6_check_expired(rt))
			continue;

		nh = &rt->fib6_nh;
		if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr))
			*match = rt;
	}
}
708
/* Round-robin leaf search: scan from the current rr pointer to the end
 * of the same-metric group, then wrap from the leaf back to rr_head.
 * Entries with a different metric are only considered (via @cont) when
 * nothing in the primary group matched.  Caller holds rcu_read_lock().
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *match = NULL, *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, &match, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, &match, &cont,
		       oif, strict, do_rr, &mpri);

	if (match || !cont)
		return match;

	/* no same-metric match: continue into the next metric group */
	__find_rr_leaf(cont, NULL, metric, &match, NULL,
		       oif, strict, do_rr, &mpri);

	return match;
}
732
/* Select the route to use from fib6_node @fn, applying round-robin
 * among equally-good entries (per the default-router-selection rules).
 * Returns fib6_null_entry when the node has no usable leaf.
 * Caller holds rcu_read_lock().
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
782
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785         return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_gw_family;
786 }
787
788 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option received in a Router Advertisement
 * (RFC 4191): validate the option, then add, refresh, or (on zero
 * lifetime) delete the corresponding RTF_ROUTEINFO route.
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/* length == 3 carries a full 128-bit prefix; otherwise copy only
	 * prefix_len bits into a zero-padded buffer
	 */
	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero prefix length means the default route via this router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
862 #endif
863
864 /*
865  *      Misc support functions
866  */
867
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871         struct net_device *dev = rt->fib6_nh.fib_nh_dev;
872
873         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874                 /* for copies of local routes, dst->dev needs to be the
875                  * device if it is a master device, the master device if
876                  * device is enslaved, and the loopback as the default
877                  */
878                 if (netif_is_l3_slave(dev) &&
879                     !rt6_need_strict(&rt->fib6_dst.addr))
880                         dev = l3mdev_master_dev_rcu(dev);
881                 else if (!netif_is_l3_master(dev))
882                         dev = dev_net(dev)->loopback_dev;
883                 /* last case is netif_is_l3_master(dev) is true in which
884                  * case we want dev returned to be dev
885                  */
886         }
887
888         return dev;
889 }
890
/* Map a fib6 route type (RTN_*) to the dst error reported to users of
 * the route: forwarding/delivery types map to 0 (no error), while the
 * administrative reject types carry the errno that traffic matching
 * the route should fail with.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
905
/* Return the dst error code (0 or negative errno) for a fib6 route type.
 * @fib6_type must be a valid RTN_* value (<= RTN_MAX); validation is
 * presumably done at route-insertion time — the table is not bounds
 * checked here.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
910
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913         unsigned short flags = 0;
914
915         if (rt->dst_nocount)
916                 flags |= DST_NOCOUNT;
917         if (rt->dst_nopolicy)
918                 flags |= DST_NOPOLICY;
919         if (rt->dst_host)
920                 flags |= DST_HOST;
921
922         return flags;
923 }
924
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928
929         switch (ort->fib6_type) {
930         case RTN_BLACKHOLE:
931                 rt->dst.output = dst_discard_out;
932                 rt->dst.input = dst_discard;
933                 break;
934         case RTN_PROHIBIT:
935                 rt->dst.output = ip6_pkt_prohibit_out;
936                 rt->dst.input = ip6_pkt_prohibit;
937                 break;
938         case RTN_THROW:
939         case RTN_UNREACHABLE:
940         default:
941                 rt->dst.output = ip6_pkt_discard_out;
942                 rt->dst.input = ip6_pkt_discard;
943                 break;
944         }
945 }
946
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949         if (ort->fib6_flags & RTF_REJECT) {
950                 ip6_rt_init_dst_reject(rt, ort);
951                 return;
952         }
953
954         rt->dst.error = 0;
955         rt->dst.output = ip6_output;
956
957         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
958                 rt->dst.input = ip6_input;
959         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
960                 rt->dst.input = ip6_mc_input;
961         } else {
962                 rt->dst.input = ip6_forward;
963         }
964
965         if (ort->fib6_nh.fib_nh_lws) {
966                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
967                 lwtunnel_set_redirect(&rt->dst);
968         }
969
970         rt->dst.lastuse = jiffies;
971 }
972
/* Caller must already hold reference to @from.
 * Publish the back-pointer from a cached rt6_info to its parent fib
 * entry and make the dst inherit the parent's metrics.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	/* any private expiry state is dropped before the back-pointer
	 * becomes visible to readers
	 */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
980
/* Caller must already hold reference to @ort.
 * Populate a freshly allocated rt6_info @rt from its parent fib entry
 * @ort: dst handlers, destination (and, with subtrees, source) prefix,
 * flags, gateway, idev reference and the from back-pointer.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* takes a reference on the device's inet6_dev, if any */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = ort->fib6_flags;
	if (ort->fib6_nh.fib_nh_gw_family) {
		rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
1000
/* Walk back up the fib tree from @fn until a node carrying route
 * information (RTN_RTINFO) is found.  When the parent node has a
 * source-address subtree that @fn did not come from, descend into it
 * using @saddr instead of continuing upward.  Returns NULL once the
 * tree root is reached without a match.
 * Caller must hold rcu_read_lock() (parent pointers are rcu-dereferenced).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1018
1019 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1020 {
1021         struct rt6_info *rt = *prt;
1022
1023         if (dst_hold_safe(&rt->dst))
1024                 return true;
1025         if (net) {
1026                 rt = net->ipv6.ip6_null_entry;
1027                 dst_hold(&rt->dst);
1028         } else {
1029                 rt = NULL;
1030         }
1031         *prt = rt;
1032         return false;
1033 }
1034
1035 /* called with rcu_lock held */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1037 {
1038         unsigned short flags = fib6_info_dst_flags(rt);
1039         struct net_device *dev = rt->fib6_nh.fib_nh_dev;
1040         struct rt6_info *nrt;
1041
1042         if (!fib6_info_hold_safe(rt))
1043                 goto fallback;
1044
1045         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1046         if (!nrt) {
1047                 fib6_info_release(rt);
1048                 goto fallback;
1049         }
1050
1051         ip6_rt_copy_init(nrt, rt);
1052         return nrt;
1053
1054 fallback:
1055         nrt = dev_net(dev)->ipv6.ip6_null_entry;
1056         dst_hold(&nrt->dst);
1057         return nrt;
1058 }
1059
/* Policy-routing lookup in a single fib6 table: find the best fib
 * entry for @fl6, preferring a cached exception route, otherwise
 * creating an uncached clone.  Returns a held rt6_info (the null
 * entry on no match).  Runs entirely under rcu_read_lock().
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i)
		f6i = net->ipv6.fib6_null_entry;
	else
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);

	if (f6i == net->ipv6.fib6_null_entry) {
		/* no usable entry at this node: backtrack toward the
		 * root and retry from any less-specific match
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	/* multipath selection only applies when no oif was requested */
	if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
		f6i = fib6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb,
					    flags);
	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

out:
	trace_fib6_table_lookup(net, f6i, table, fl6);

	rcu_read_unlock();

	return rt;
}
1112
/* Public route lookup entry point: runs @fl6 through the fib rules
 * framework with ip6_pol_route_lookup() as the per-table resolver.
 * Returns a held dst_entry (never NULL; errors are reported via
 * dst->error).
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1119
1120 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1121                             const struct in6_addr *saddr, int oif,
1122                             const struct sk_buff *skb, int strict)
1123 {
1124         struct flowi6 fl6 = {
1125                 .flowi6_oif = oif,
1126                 .daddr = *daddr,
1127         };
1128         struct dst_entry *dst;
1129         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1130
1131         if (saddr) {
1132                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1133                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1134         }
1135
1136         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1137         if (dst->error == 0)
1138                 return (struct rt6_info *) dst;
1139
1140         dst_release(dst);
1141
1142         return NULL;
1143 }
1144 EXPORT_SYMBOL(rt6_lookup);
1145
1146 /* ip6_ins_rt is called with FREE table->tb6_lock.
1147  * It takes new route entry, the addition fails by any reason the
1148  * route is released.
1149  * Caller must hold dst before calling it.
1150  */
1151
1152 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1153                         struct netlink_ext_ack *extack)
1154 {
1155         int err;
1156         struct fib6_table *table;
1157
1158         table = rt->fib6_table;
1159         spin_lock_bh(&table->tb6_lock);
1160         err = fib6_add(&table->tb6_root, rt, info, extack);
1161         spin_unlock_bh(&table->tb6_lock);
1162
1163         return err;
1164 }
1165
/* Insert @rt into its fib table with default (no-netlink-attrs)
 * notification info.  Returns 0 or a negative errno from fib6_add().
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1172
/* Allocate a host-route (plen 128) cache clone of @ort for the given
 * destination (and, with subtrees, source) address, e.g. for PMTU or
 * redirect exceptions.  Returns NULL if @ort is going away or the
 * allocation fails.  Caller appears to be responsible for holding rcu
 * around the device lookup (ip6_rt_get_dev_rcu).
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* the clone keeps a reference on @ort (released on failure) */
	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* a clone of an on-link prefix route to its own prefix
		 * address is an anycast route
		 */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1214
/* Allocate a per-cpu cached clone of fib entry @rt.  Takes a reference
 * on @rt for the clone (released on allocation failure).  Returns NULL
 * if @rt is being freed or the dst allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* rcu protects the device returned by ip6_rt_get_dev_rcu() for
	 * the duration of the allocation
	 */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1236
1237 /* It should be called with rcu_read_lock() acquired */
1238 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1239 {
1240         struct rt6_info *pcpu_rt, **p;
1241
1242         p = this_cpu_ptr(rt->rt6i_pcpu);
1243         pcpu_rt = *p;
1244
1245         if (pcpu_rt)
1246                 ip6_hold_safe(NULL, &pcpu_rt);
1247
1248         return pcpu_rt;
1249 }
1250
/* Create and publish this cpu's cached clone of fib entry @rt.
 * Returns a held rt6_info; falls back to the held null entry when the
 * clone cannot be allocated.  The cmpxchg must not find an existing
 * entry: callers only get here when the per-cpu slot was empty.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* one reference for the per-cpu slot, the allocation's own
	 * reference is returned to the caller
	 */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1269
1270 /* exception hash table implementation
1271  */
1272 static DEFINE_SPINLOCK(rt6_exception_lock);
1273
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	/* break the back-pointer and drop the reference it held */
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the hash table's reference; the entry itself is freed
	 * after a grace period so rcu readers stay safe
	 */
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1304
1305 /* Remove oldest rt6_ex in bucket and free the memory
1306  * Caller must hold rt6_exception_lock
1307  */
1308 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1309 {
1310         struct rt6_exception *rt6_ex, *oldest = NULL;
1311
1312         if (!bucket)
1313                 return;
1314
1315         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1316                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1317                         oldest = rt6_ex;
1318         }
1319         rt6_remove_exception(bucket, oldest);
1320 }
1321
/* Hash a (dst, src) address pair into an exception-bucket index in
 * [0, 2^FIB6_EXCEPTION_BUCKET_SIZE_SHIFT).  @src is only folded in on
 * subtree builds and may be NULL.
 * NOTE(review): jhash keyed with a boot-time random seed — consider
 * whether an attacker who can trigger exception creation could probe
 * for collisions; a keyed hash like siphash would be stronger.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1337
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair.
 * Note: *bucket must point at the base of the bucket array on entry;
 * it is advanced in place to the bucket matching the hash.
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* with subtrees the source must match too when given */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1370
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair.
 * rcu counterpart of __rt6_find_exception_spinlock(); *bucket is
 * advanced in place to the bucket matching the hash.
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* with subtrees the source must match too when given */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1405
/* Effective MTU of fib entry @rt: the stored route PMTU if set,
 * otherwise the nexthop device's IPv6 MTU; clamped to IP6_MAX_MTU and
 * reduced by any lightweight-tunnel encapsulation headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): __in6_dev_get() can return NULL; this
		 * assumes the nexthop device always has an inet6_dev —
		 * confirm callers guarantee that.
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
}
1426
/* Insert cached route @nrt into the exception table of its parent fib
 * entry @ort, creating the bucket array on first use and evicting the
 * oldest entry if the bucket grows past FIB6_MAX_DEPTH.  Returns 0 on
 * success or a negative errno; on success the tree serial number is
 * bumped so stale cached dsts get revalidated.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* a flushed table is being torn down; never repopulate it */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same address pair */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1506
/* Drop every cached exception route of fib entry @rt and mark the
 * table flushed so rt6_insert_exception() cannot repopulate it while
 * @rt is being torn down.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe: rt6_remove_exception() unlinks the entry */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1533
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the cached route if one exists for (daddr, saddr) and has
 * not expired, NULL otherwise.  No reference is taken; the result is
 * only valid within the rcu critical section.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1565
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL when @rt is not a cached route (or has
 * no parent fib entry), -ENOENT when it is not found in the table.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1609
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp (last-use time, consulted by exception aging).
 * No-op if @rt is not a cached route or has no parent fib entry.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1646
1647 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1648                                          struct rt6_info *rt, int mtu)
1649 {
1650         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1651          * lowest MTU in the path: always allow updating the route PMTU to
1652          * reflect PMTU decreases.
1653          *
1654          * If the new MTU is higher, and the route PMTU is equal to the local
1655          * MTU, this means the old MTU is the lowest in the path, so allow
1656          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1657          * handle this.
1658          */
1659
1660         if (dst_mtu(&rt->dst) >= mtu)
1661                 return true;
1662
1663         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1664                 return true;
1665
1666         return false;
1667 }
1668
/* Propagate a device MTU change to all cached exception routes of fib
 * entry @rt that carry their own RTAX_MTU metric, subject to the
 * rt6_mtu_change_route_allowed() policy.
 * Caller must hold rt6_exception_lock (bucket is lockdep-protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1697
/* cached exception route learned through a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove every cached exception route of fib entry @rt whose gateway
 * equals @gateway, e.g. when a redirect made that gateway obsolete.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the exception lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe: rt6_remove_exception() unlinks entries */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1734
/* Examine one exception entry during garbage collection and remove it
 * if it is aged out, expired, or points through a gateway whose
 * neighbour entry is no longer flagged as a router.  Surviving entries
 * bump gc_args->more so the gc timer keeps running.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		/* drop gateway exceptions whose nexthop stopped
		 * advertising itself as a router
		 */
		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1778
/* Run exception-route garbage collection for @rt: walk every exception
 * bucket and let rt6_age_examine_exception() decide each entry's fate.
 * Takes rcu_read_lock_bh() (needed by the noref neighbour lookup in the
 * examine step) around rt6_exception_lock.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* lockless fast path: nothing cached on this route */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries may be removed as we walk */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1809
/* Look up the fib6_info for @fl6 in @table, honouring @strict lookup
 * flags.  On a miss, backtracks up the fib tree; if still unmatched and
 * RT6_LOOKUP_F_REACHABLE was set, retries once from the original node
 * with the reachability requirement dropped.  Returns
 * net->ipv6.fib6_null_entry when nothing matches.
 *
 * must be called with rcu lock held
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* caller asked to ignore the nexthop's output device */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1841
/* Core IPv6 policy-routing lookup.
 *
 * Resolves @fl6 in @table and returns an rt6_info with a reference the
 * caller must release.  Resolution order after the fib lookup (with
 * multipath selection when siblings exist):
 *   1. a cached exception route, if one matches;
 *   2. for FLOWI_FLAG_KNOWN_NH on a gateway-less nexthop, a fresh
 *      uncached RTF_CACHE clone (not owned by the fib tree);
 *   3. otherwise a per-cpu copy of the fib entry.
 * Never returns NULL: falls back to ip6_null_entry on lookup failure.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* when forwarding is globally off, prefer (probably) reachable
	 * routers for the lookup
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !f6i->fib6_nh.fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BH off: the percpu route may also be touched from
		 * softirq context
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1919
1920 static struct rt6_info *ip6_pol_route_input(struct net *net,
1921                                             struct fib6_table *table,
1922                                             struct flowi6 *fl6,
1923                                             const struct sk_buff *skb,
1924                                             int flags)
1925 {
1926         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1927 }
1928
1929 struct dst_entry *ip6_route_input_lookup(struct net *net,
1930                                          struct net_device *dev,
1931                                          struct flowi6 *fl6,
1932                                          const struct sk_buff *skb,
1933                                          int flags)
1934 {
1935         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1936                 flags |= RT6_LOOKUP_F_IFACE;
1937
1938         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1939 }
1940 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1941
/* Fill @keys with the L3 multipath-hash keys for @skb.
 *
 * For ICMPv6 error messages (dest unreach, packet too big, time
 * exceeded, param problem) the keys are taken from the offending packet
 * embedded in the ICMP payload, so the error is hashed onto the same
 * path as the flow that triggered it.  In that case the pre-dissected
 * outer-packet keys in @flkeys must not be used, which is why _flkeys
 * is cleared before falling through to out:.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* only ICMPv6 errors carry an embedded offending packet we
	 * should hash on
	 */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* hash on the inner (offending) header; outer flkeys no longer apply */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1989
/* Compute the multipath hash for a flow, per the namespace's
 * fib_multipath_hash_policy sysctl: policy 0 hashes L3 fields only,
 * policy 1 hashes the L4 five-tuple.  The top bit of the raw hash is
 * discarded (>> 1) so the result fits the multipath selection range.
 *
 * NOTE(review): the switch has no default case, so hash_keys would be
 * read uninitialized for any other policy value — presumably the sysctl
 * bounds the policy to 0..1; confirm against the sysctl table.
 *
 * if skb is set it will be used and fl6 can be NULL
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* no pre-dissected keys supplied: dissect now */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2046
/* Route an incoming packet: build a flowi6 from the IPv6 header (plus
 * tunnel key and skb mark where present), compute the multipath hash
 * for ICMPv6 so errors follow their flow, then attach the looked-up
 * dst to the skb, replacing any existing one.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* carry the RX tunnel id into the lookup key */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* hash ICMPv6 errors onto the path of the offending flow */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2076
2077 static struct rt6_info *ip6_pol_route_output(struct net *net,
2078                                              struct fib6_table *table,
2079                                              struct flowi6 *fl6,
2080                                              const struct sk_buff *skb,
2081                                              int flags)
2082 {
2083         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2084 }
2085
2086 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2087                                          struct flowi6 *fl6, int flags)
2088 {
2089         bool any_src;
2090
2091         if (ipv6_addr_type(&fl6->daddr) &
2092             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2093                 struct dst_entry *dst;
2094
2095                 dst = l3mdev_link_scope_lookup(net, fl6);
2096                 if (dst)
2097                         return dst;
2098         }
2099
2100         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2101
2102         any_src = ipv6_addr_any(&fl6->saddr);
2103         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2104             (fl6->flowi6_oif && any_src))
2105                 flags |= RT6_LOOKUP_F_IFACE;
2106
2107         if (!any_src)
2108                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2109         else if (sk)
2110                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2111
2112         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2113 }
2114 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2115
/* Build a blackhole clone of @dst_orig: a loopback-bound dst that
 * discards everything on input and output while preserving the
 * original's metrics, gateway, flags (minus RTF_PCPU) and keys.
 * Consumes the caller's reference on @dst_orig and returns the new dst,
 * or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* drop all traffic in both directions */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	/* the original reference is consumed regardless of success */
	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2148
2149 /*
2150  *      Destination cache support functions
2151  */
2152
2153 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2154 {
2155         u32 rt_cookie = 0;
2156
2157         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2158                 return false;
2159
2160         if (fib6_check_expired(f6i))
2161                 return false;
2162
2163         return true;
2164 }
2165
2166 static struct dst_entry *rt6_check(struct rt6_info *rt,
2167                                    struct fib6_info *from,
2168                                    u32 cookie)
2169 {
2170         u32 rt_cookie = 0;
2171
2172         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2173             rt_cookie != cookie)
2174                 return NULL;
2175
2176         if (rt6_check_expired(rt))
2177                 return NULL;
2178
2179         return &rt->dst;
2180 }
2181
2182 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2183                                             struct fib6_info *from,
2184                                             u32 cookie)
2185 {
2186         if (!__rt6_check_expired(rt) &&
2187             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2188             fib6_check(from, cookie))
2189                 return &rt->dst;
2190         else
2191                 return NULL;
2192 }
2193
/* dst_ops->check callback: revalidate a cached dst against @cookie.
 * Percpu and uncached routes are checked through their fib6_info origin
 * (rt6_dst_from_check), plain entries via rt6_check.  Returns the dst
 * if still valid, NULL to force the caller to relookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2221
/* dst_ops->negative_advice callback: the caller has reason to distrust
 * this dst.  Expired cache exceptions are removed from the exception
 * table; non-cache entries are simply released.  Returns the dst if it
 * should be kept, NULL if it was dropped.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				/* unlink from the exception table; the
				 * caller's reference is not released here
				 */
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2241
/* dst_ops->link_failure callback: report address unreachable to the
 * sender, then invalidate the route that failed — cache exceptions are
 * removed outright, while for default routes the fib node's sernum is
 * poisoned (-1) so cached dsts referencing it fail their next check.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2267
/* Arm (or re-arm) the expiry timer on @rt0 for @timeout jiffies and
 * mark it RTF_EXPIRES.  If the route had no private expiry yet, first
 * inherit the expiry of its fib6_info origin so dst_set_expires() only
 * ever shortens the deadline.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2283
/* Record a learned path MTU on @rt: set the metric, flag the route as
 * modified, and (re)start its expiry per the ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2292
2293 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2294 {
2295         return !(rt->rt6i_flags & RTF_CACHE) &&
2296                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2297 }
2298
/* Apply a path-MTU update to @dst, for the destination taken from @iph,
 * else from @sk, else unknown.  The update is only accepted when it
 * lowers the current MTU (clamped to IPV6_MIN_MTU) and the metric is
 * not locked.  Routes that may not be cached take the new MTU directly;
 * otherwise a cache exception clone carrying the new MTU is inserted
 * under the route's fib6_info origin.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	/* a PMTU message implies the neighbour is reachable */
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		if (!from) {
			rcu_read_unlock();
			return;
		}
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* insertion failure means the clone is unused;
			 * drop the reference taken at allocation
			 */
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2347
2348 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2349                                struct sk_buff *skb, u32 mtu)
2350 {
2351         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2352 }
2353
/* Update the path MTU toward the destination of the packet in @skb:
 * perform an output route lookup for that flow and apply @mtu (network
 * byte order) to the resulting dst.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2374
/* Socket-aware PMTU update: apply @mtu for the flow in @skb using the
 * socket's bound device (or its l3mdev master), then, if the socket's
 * cached dst no longer validates, refresh the datagram socket's route
 * under the socket lock.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	/* nothing more to do if the socket's dst is absent or still valid */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2396
/* Store @dst on @sk via ip6_dst_store(), recording the destination (and,
 * with subtrees enabled, the source) address only when it matches the
 * flow, so the cached route is keyed correctly for later validation.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	/* NOTE: the #ifdef supplies the saddr argument inline; without
	 * subtrees the final NULL is that argument.
	 */
	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2413
/* Check whether a redirect received from gateway @gw applies to nexthop
 * @nh of @f6i for flow @fl6.  The nexthop must be alive, gatewayed, and
 * on the flow's output device.  When the redirecting gateway differs
 * from the nexthop's, a matching cached exception route (whose gateway
 * may have been rewritten by a previous redirect) is also accepted and
 * returned via @ret.
 */
static bool ip6_redirect_nh_match(struct fib6_info *f6i,
				  struct fib6_nh *nh,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}
2442
/* Handle redirects */
/* flowi6 extended with the redirecting router's address; passed to the
 * lookup as a plain flowi6 (fl6 must stay the first member) and cast
 * back inside __ip6_route_redirect().
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2448
2449 static struct rt6_info *__ip6_route_redirect(struct net *net,
2450                                              struct fib6_table *table,
2451                                              struct flowi6 *fl6,
2452                                              const struct sk_buff *skb,
2453                                              int flags)
2454 {
2455         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2456         struct rt6_info *ret = NULL;
2457         struct fib6_info *rt;
2458         struct fib6_node *fn;
2459
2460         /* Get the "current" route for this destination and
2461          * check if the redirect has come from appropriate router.
2462          *
2463          * RFC 4861 specifies that redirects should only be
2464          * accepted if they come from the nexthop to the target.
2465          * Due to the way the routes are chosen, this notion
2466          * is a bit fuzzy and one might need to check all possible
2467          * routes.
2468          */
2469
2470         rcu_read_lock();
2471         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2472 restart:
2473         for_each_fib6_node_rt_rcu(fn) {
2474                 if (fib6_check_expired(rt))
2475                         continue;
2476                 if (rt->fib6_flags & RTF_REJECT)
2477                         break;
2478                 if (ip6_redirect_nh_match(rt, &rt->fib6_nh, fl6,
2479                                           &rdfl->gateway, &ret))
2480                         goto out;
2481         }
2482
2483         if (!rt)
2484                 rt = net->ipv6.fib6_null_entry;
2485         else if (rt->fib6_flags & RTF_REJECT) {
2486                 ret = net->ipv6.ip6_null_entry;
2487                 goto out;
2488         }
2489
2490         if (rt == net->ipv6.fib6_null_entry) {
2491                 fn = fib6_backtrack(fn, &fl6->saddr);
2492                 if (fn)
2493                         goto restart;
2494         }
2495
2496 out:
2497         if (ret)
2498                 ip6_hold_safe(net, &ret);
2499         else
2500                 ret = ip6_create_rt_rcu(rt);
2501
2502         rcu_read_unlock();
2503
2504         trace_fib6_table_lookup(net, rt, table, fl6);
2505         return ret;
2506 };
2507
2508 static struct dst_entry *ip6_route_redirect(struct net *net,
2509                                             const struct flowi6 *fl6,
2510                                             const struct sk_buff *skb,
2511                                             const struct in6_addr *gateway)
2512 {
2513         int flags = RT6_LOOKUP_F_HAS_SADDR;
2514         struct ip6rd_flowi rdfl;
2515
2516         rdfl.fl6 = *fl6;
2517         rdfl.gateway = *gateway;
2518
2519         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2520                                 flags, __ip6_route_redirect);
2521 }
2522
/* Process an ICMPv6 redirect for the packet in @skb: look up the
 * redirect route for the flow (keyed on the redirecting router's source
 * address) and apply it via rt6_do_redirect().
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2543
/* Process an ICMPv6 redirect when the embedded packet's header is not
 * available at skb->data; build the flow from the redirect message and
 * the outer IPv6 header instead.
 *
 * NOTE(review): .saddr is set from iph->daddr (the address the redirect
 * was sent to, i.e. ours) and the lookup gateway from iph->saddr (the
 * redirecting router) — looks intentional, confirm against callers.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2561
2562 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2563 {
2564         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2565                      sk->sk_uid);
2566 }
2567 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2568
2569 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2570 {
2571         struct net_device *dev = dst->dev;
2572         unsigned int mtu = dst_mtu(dst);
2573         struct net *net = dev_net(dev);
2574
2575         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2576
2577         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2578                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2579
2580         /*
2581          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2582          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2583          * IPV6_MAXPLEN is also valid and means: "any MSS,
2584          * rely only on pmtu discovery"
2585          */
2586         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2587                 mtu = IPV6_MAXPLEN;
2588         return mtu;
2589 }
2590
2591 static unsigned int ip6_mtu(const struct dst_entry *dst)
2592 {
2593         struct inet6_dev *idev;
2594         unsigned int mtu;
2595
2596         mtu = dst_metric_raw(dst, RTAX_MTU);
2597         if (mtu)
2598                 goto out;
2599
2600         mtu = IPV6_MIN_MTU;
2601
2602         rcu_read_lock();
2603         idev = __in6_dev_get(dst->dev);
2604         if (idev)
2605                 mtu = idev->cnf.mtu6;
2606         rcu_read_unlock();
2607
2608 out:
2609         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2610
2611         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2612 }
2613
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* 1. A locked MTU metric overrides everything else. */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* 2. Check for a PMTU learned via a cached route exception.
	 * The source address is part of the exception key only when
	 * source-specific routing (subtrees) is in effect for f6i.
	 */
	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* 3. Fall back to the egress device mtu6, never below IPV6_MIN_MTU. */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* Leave room for any lwtunnel encapsulation headers. */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2661
/* Allocate a standalone (non-FIB) dst for transmitting an ICMPv6
 * packet out @dev.
 *
 * The dst is placed on the uncached list so rt6_disable_ip() can
 * release the net_device if the interface goes down.  Returns the
 * (possibly xfrm-wrapped) dst on success, or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* rt allocation failed: drop the idev ref taken above */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* ownership of the idev reference transfers to rt here */
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	/* May bundle rt->dst with xfrm state or return an error dst. */
	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2700
/* dst_ops garbage collector for IPv6.
 *
 * Called from the dst allocation path; runs fib6 gc when the last run
 * was long enough ago or the entry count exceeds ip6_rt_max_size.
 * Returns non-zero when entries still exceed the cap after gc, which
 * makes new dst allocations fail.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Rate-limit: skip gc when it ran recently and we are under the cap. */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* Successive passes become more aggressive until pressure drops. */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Exponential decay of the aggressiveness between invocations. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2725
2726 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2727                                             struct fib6_config *cfg,
2728                                             const struct in6_addr *gw_addr,
2729                                             u32 tbid, int flags)
2730 {
2731         struct flowi6 fl6 = {
2732                 .flowi6_oif = cfg->fc_ifindex,
2733                 .daddr = *gw_addr,
2734                 .saddr = cfg->fc_prefsrc,
2735         };
2736         struct fib6_table *table;
2737         struct rt6_info *rt;
2738
2739         table = fib6_get_table(net, tbid);
2740         if (!table)
2741                 return NULL;
2742
2743         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2744                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2745
2746         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2747         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2748
2749         /* if table lookup failed, fall back to full lookup */
2750         if (rt == net->ipv6.ip6_null_entry) {
2751                 ip6_rt_put(rt);
2752                 rt = NULL;
2753         }
2754
2755         return rt;
2756 }
2757
/* Validate the gateway of an RTNH_F_ONLINK nexthop.
 *
 * The gateway must not resolve, in the egress device's FIB table, to a
 * local/anycast/reject route or to a route via a different device; a
 * match on the default route is ignored.  Returns 0 if acceptable,
 * -EINVAL (with an extack message) otherwise.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		/* grt->from is RCU-protected; hold the read lock across it */
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		/* drop the reference taken by ip6_nh_lookup_table() */
		ip6_rt_put(grt);
	}

	return err;
}
2790
/* Resolve and validate the gateway of a non-onlink nexthop.
 *
 * Tries cfg->fc_table first (if set), then falls back to a full
 * rt6_lookup().  When the caller did not supply an egress device,
 * *_dev and *idev are populated from the resolved route with
 * references held for the caller.  Returns 0 when the gateway
 * resolves to a directly-connected route, -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* discard a recursive-gateway or wrong-device match
			 * and retry below with a full lookup
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt device and idev from the resolved route,
		 * taking references the caller is responsible for
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2841
/* Validate the gateway address and egress device of a route being added.
 *
 * Rejects gateways that are local addresses, enforces the link-local
 * nexthop rule (with the IPv4-mapped RFC 4798 exception), resolves the
 * egress device when one was not given, and rejects loopback egress.
 * On success *_dev / *idev are valid (possibly updated by
 * ip6_route_check_nh()).  Returns 0 or a negative errno with extack set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* only restrict the local-address check to @dev for link-local gws */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	/* no device yet: the local-address check must be redone once
	 * the egress device is known
	 */
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2914
2915 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2916 {
2917         if ((flags & RTF_REJECT) ||
2918             (dev && (dev->flags & IFF_LOOPBACK) &&
2919              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2920              !(flags & RTF_LOCAL)))
2921                 return true;
2922
2923         return false;
2924 }
2925
/* Initialize a fib6_nh from a route configuration.
 *
 * Resolves the egress device (promoting reject-style routes to the
 * loopback device), validates any gateway, checks device state, and
 * sets up lwtunnel encapsulation via fib_nh_common_init().  On success
 * fib6_nh->fib_nh_dev holds a device reference owned by the nexthop.
 * Returns 0 or a negative errno with extack populated.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		/* both references are dropped in the out path below */
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		/* reject routes skip gateway/device-state validation */
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may resolve and replace dev/idev when none was given */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	/* sets up lwtunnel state (fib_nh_lws) among other common fields */
	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;
set_dev:
	/* success: the dev reference is transferred to the nexthop */
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	if (idev)
		in6_dev_put(idev);

	if (err) {
		/* undo any partial lwtunnel setup and drop the dev ref */
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
3037
/* Release the common nexthop state set up by fib6_nh_init(). */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	fib_nh_common_release(&fib6_nh->nh_common);
}
3042
/* Build a fib6_info from a netlink/ioctl route configuration.
 *
 * Validates the config (rejecting internal-only flags and bad prefix
 * lengths), selects or creates the FIB table, allocates the fib6_info,
 * fills in metrics/expiry/addresses, and initializes the nexthop.
 * Returns a fib6_info holding one reference (released by the caller
 * after insertion) or an ERR_PTR with extack populated.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	err = -ENOBUFS;
	/* without NLM_F_CREATE an existing table is expected, but a new
	 * one is still created (with a warning) for compatibility
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	/* RTF_GATEWAY is reflected by fib_nh_gw_family on the nexthop */
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		/* the preferred source must be configured on the device */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	/* fib6_info_release(NULL) is safe on the early error paths */
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3171
3172 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3173                   struct netlink_ext_ack *extack)
3174 {
3175         struct fib6_info *rt;
3176         int err;
3177
3178         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3179         if (IS_ERR(rt))
3180                 return PTR_ERR(rt);
3181
3182         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3183         fib6_info_release(rt);
3184
3185         return err;
3186 }
3187
3188 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3189 {
3190         struct net *net = info->nl_net;
3191         struct fib6_table *table;
3192         int err;
3193
3194         if (rt == net->ipv6.fib6_null_entry) {
3195                 err = -ENOENT;
3196                 goto out;
3197         }
3198
3199         table = rt->fib6_table;
3200         spin_lock_bh(&table->tb6_lock);
3201         err = fib6_del(rt, info);
3202         spin_unlock_bh(&table->tb6_lock);
3203
3204 out:
3205         fib6_info_release(rt);
3206         return err;
3207 }
3208
3209 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3210 {
3211         struct nl_info info = { .nl_net = net };
3212
3213         return __ip6_del_rt(rt, &info);
3214 }
3215
/* Delete a multipath route: @rt and, when fc_delete_all_nh is set, all
 * of its siblings, under a single table lock.
 *
 * A combined RTM_DELROUTE notification covering every hop is built
 * before deletion and sent after the lock is dropped; per-route
 * notifications are suppressed (skip_notify) when that succeeds.
 * Drops the caller's reference on @rt in all cases.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* fall back to per-route notifications */
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe iteration: fib6_del unlinks each sibling */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3267
3268 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3269 {
3270         int rc = -ESRCH;
3271
3272         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3273                 goto out;
3274
3275         if (cfg->fc_flags & RTF_GATEWAY &&
3276             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3277                 goto out;
3278
3279         rc = rt6_remove_exception_rt(rt);
3280 out:
3281         return rc;
3282 }
3283
/* Delete the route described by @cfg.
 *
 * Walks the matching FIB node under RCU.  With RTF_CACHE set, only a
 * cached exception route is removed; otherwise the first fib6_info
 * matching the device/gateway/metric/protocol constraints is deleted
 * (one hop when a gateway was specified, all siblings otherwise).
 * Returns 0 on success, -ESRCH when nothing matched, or an errno.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* for RTF_CACHE an intermediate (non-exact) node may hold the
	 * exception, hence the inverted exact_match argument
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					/* -ESRCH means constraint mismatch:
					 * keep scanning other routes
					 */
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}

			nh = &rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* route is being freed concurrently: skip it */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* both helpers consume the reference taken above */

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3352
3353 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3354 {
3355         struct netevent_redirect netevent;
3356         struct rt6_info *rt, *nrt = NULL;
3357         struct ndisc_options ndopts;
3358         struct inet6_dev *in6_dev;
3359         struct neighbour *neigh;
3360         struct fib6_info *from;
3361         struct rd_msg *msg;
3362         int optlen, on_link;
3363         u8 *lladdr;
3364
3365         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3366         optlen -= sizeof(*msg);
3367
3368         if (optlen < 0) {
3369                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3370                 return;
3371         }
3372
3373         msg = (struct rd_msg *)icmp6_hdr(skb);
3374
3375         if (ipv6_addr_is_multicast(&msg->dest)) {
3376                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3377                 return;
3378         }
3379
3380         on_link = 0;
3381         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3382                 on_link = 1;
3383         } else if (ipv6_addr_type(&msg->target) !=
3384                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3385                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3386                 return;
3387         }
3388
3389         in6_dev = __in6_dev_get(skb->dev);
3390         if (!in6_dev)
3391                 return;
3392         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3393                 return;
3394
3395         /* RFC2461 8.1:
3396          *      The IP source address of the Redirect MUST be the same as the current
3397          *      first-hop router for the specified ICMP Destination Address.
3398          */
3399
3400         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3401                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3402                 return;
3403         }
3404
3405         lladdr = NULL;
3406         if (ndopts.nd_opts_tgt_lladdr) {
3407                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3408                                              skb->dev);
3409                 if (!lladdr) {
3410                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3411                         return;
3412                 }
3413         }
3414
3415         rt = (struct rt6_info *) dst;
3416         if (rt->rt6i_flags & RTF_REJECT) {
3417                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3418                 return;
3419         }
3420
3421         /* Redirect received -> path was valid.
3422          * Look, redirects are sent only in response to data packets,
3423          * so that this nexthop apparently is reachable. --ANK
3424          */
3425         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3426
3427         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3428         if (!neigh)
3429                 return;
3430
3431         /*
3432          *      We have finally decided to accept it.
3433          */
3434
3435         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3436                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3437                      NEIGH_UPDATE_F_OVERRIDE|
3438                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3439                                      NEIGH_UPDATE_F_ISROUTER)),
3440                      NDISC_REDIRECT, &ndopts);
3441
3442         rcu_read_lock();
3443         from = rcu_dereference(rt->from);
3444         /* This fib6_info_hold() is safe here because we hold reference to rt
3445          * and rt already holds reference to fib6_info.
3446          */
3447         fib6_info_hold(from);
3448         rcu_read_unlock();
3449
3450         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3451         if (!nrt)
3452                 goto out;
3453
3454         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3455         if (on_link)
3456                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3457
3458         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3459
3460         /* No need to remove rt from the exception table if rt is
3461          * a cached route because rt6_insert_exception() will
3462          * takes care of it
3463          */
3464         if (rt6_insert_exception(nrt, from)) {
3465                 dst_release_immediate(&nrt->dst);
3466                 goto out;
3467         }
3468
3469         netevent.old = &rt->dst;
3470         netevent.new = &nrt->dst;
3471         netevent.daddr = &msg->dest;
3472         netevent.neigh = neigh;
3473         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3474
3475 out:
3476         fib6_info_release(from);
3477         neigh_release(neigh);
3478 }
3479
3480 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a route previously installed from an RA Route Information
 * option (RTF_ROUTEINFO) for @prefix/@prefixlen via gateway @gwaddr on
 * @dev.  Returns the matching fib6_info with a reference taken
 * (fib6_info_hold_safe), or NULL if none is found.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* route-info entries go in the l3mdev table when dev is enslaved */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* exact prefix lookup; no source-address (subtree) component */
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* 'rt' is the loop cursor supplied by the iteration macro */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		/* skip entries whose refcount already dropped to zero */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3517
3518 static struct fib6_info *rt6_add_route_info(struct net *net,
3519                                            const struct in6_addr *prefix, int prefixlen,
3520                                            const struct in6_addr *gwaddr,
3521                                            struct net_device *dev,
3522                                            unsigned int pref)
3523 {
3524         struct fib6_config cfg = {
3525                 .fc_metric      = IP6_RT_PRIO_USER,
3526                 .fc_ifindex     = dev->ifindex,
3527                 .fc_dst_len     = prefixlen,
3528                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3529                                   RTF_UP | RTF_PREF(pref),
3530                 .fc_protocol = RTPROT_RA,
3531                 .fc_type = RTN_UNICAST,
3532                 .fc_nlinfo.portid = 0,
3533                 .fc_nlinfo.nlh = NULL,
3534                 .fc_nlinfo.nl_net = net,
3535         };
3536
3537         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3538         cfg.fc_dst = *prefix;
3539         cfg.fc_gateway = *gwaddr;
3540
3541         /* We should treat it as a default route if prefix length is 0. */
3542         if (!prefixlen)
3543                 cfg.fc_flags |= RTF_DEFAULT;
3544
3545         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3546
3547         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3548 }
3549 #endif
3550
/* Find the RA-learned default route (RTF_ADDRCONF | RTF_DEFAULT) whose
 * gateway is @addr and whose nexthop device is @dev.  Returns the entry
 * with a reference held, or NULL if absent or already being freed.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* default routers live in the l3mdev table when dev is enslaved */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* 'rt' is the loop cursor supplied by the iteration macro */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	/* entry may be mid-teardown; treat an unref-able match as a miss */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3577
3578 struct fib6_info *rt6_add_dflt_router(struct net *net,
3579                                      const struct in6_addr *gwaddr,
3580                                      struct net_device *dev,
3581                                      unsigned int pref)
3582 {
3583         struct fib6_config cfg = {
3584                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3585                 .fc_metric      = IP6_RT_PRIO_USER,
3586                 .fc_ifindex     = dev->ifindex,
3587                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3588                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3589                 .fc_protocol = RTPROT_RA,
3590                 .fc_type = RTN_UNICAST,
3591                 .fc_nlinfo.portid = 0,
3592                 .fc_nlinfo.nlh = NULL,
3593                 .fc_nlinfo.nl_net = net,
3594         };
3595
3596         cfg.fc_gateway = *gwaddr;
3597
3598         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3599                 struct fib6_table *table;
3600
3601                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3602                 if (table)
3603                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3604         }
3605
3606         return rt6_get_dflt_router(net, gwaddr, dev);
3607 }
3608
/* Delete every RA-learned default route in @table, then clear the
 * table's "has default router" flag.  Routes on devices whose inet6_dev
 * has accept_ra == 2 are kept (that setting keeps RA routes alive —
 * NOTE(review): presumably "accept RA even when forwarding"; confirm
 * against addrconf documentation).
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	/* 'rt' is the loop cursor supplied by the iteration macro */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			/* ip6_del_rt() may sleep/take locks, so the RCU read
			 * section must be dropped first; the walk is then no
			 * longer valid and has to restart from the root.
			 */
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3632
3633 void rt6_purge_dflt_routers(struct net *net)
3634 {
3635         struct fib6_table *table;
3636         struct hlist_head *head;
3637         unsigned int h;
3638
3639         rcu_read_lock();
3640
3641         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3642                 head = &net->ipv6.fib_table_hash[h];
3643                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3644                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3645                                 __rt6_purge_dflt_routers(net, table);
3646                 }
3647         }
3648
3649         rcu_read_unlock();
3650 }
3651
3652 static void rtmsg_to_fib6_config(struct net *net,
3653                                  struct in6_rtmsg *rtmsg,
3654                                  struct fib6_config *cfg)
3655 {
3656         *cfg = (struct fib6_config){
3657                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3658                          : RT6_TABLE_MAIN,
3659                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3660                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3661                 .fc_expires = rtmsg->rtmsg_info,
3662                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3663                 .fc_src_len = rtmsg->rtmsg_src_len,
3664                 .fc_flags = rtmsg->rtmsg_flags,
3665                 .fc_type = rtmsg->rtmsg_type,
3666
3667                 .fc_nlinfo.nl_net = net,
3668
3669                 .fc_dst = rtmsg->rtmsg_dst,
3670                 .fc_src = rtmsg->rtmsg_src,
3671                 .fc_gateway = rtmsg->rtmsg_gateway,
3672         };
3673 }
3674
3675 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3676 {
3677         struct fib6_config cfg;
3678         struct in6_rtmsg rtmsg;
3679         int err;
3680
3681         switch (cmd) {
3682         case SIOCADDRT:         /* Add a route */
3683         case SIOCDELRT:         /* Delete a route */
3684                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3685                         return -EPERM;
3686                 err = copy_from_user(&rtmsg, arg,
3687                                      sizeof(struct in6_rtmsg));
3688                 if (err)
3689                         return -EFAULT;
3690
3691                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3692
3693                 rtnl_lock();
3694                 switch (cmd) {
3695                 case SIOCADDRT:
3696                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3697                         break;
3698                 case SIOCDELRT:
3699                         err = ip6_route_del(&cfg, NULL);
3700                         break;
3701                 default:
3702                         err = -EINVAL;
3703                 }
3704                 rtnl_unlock();
3705
3706                 return err;
3707         }
3708
3709         return -EINVAL;
3710 }
3711
3712 /*
3713  *      Drop the packet on the floor
3714  */
3715
3716 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3717 {
3718         int type;
3719         struct dst_entry *dst = skb_dst(skb);
3720         switch (ipstats_mib_noroutes) {
3721         case IPSTATS_MIB_INNOROUTES:
3722                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3723                 if (type == IPV6_ADDR_ANY) {
3724                         IP6_INC_STATS(dev_net(dst->dev),
3725                                       __in6_dev_get_safely(skb->dev),
3726                                       IPSTATS_MIB_INADDRERRORS);
3727                         break;
3728                 }
3729                 /* FALLTHROUGH */
3730         case IPSTATS_MIB_OUTNOROUTES:
3731                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3732                               ipstats_mib_noroutes);
3733                 break;
3734         }
3735         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3736         kfree_skb(skb);
3737         return 0;
3738 }
3739
3740 static int ip6_pkt_discard(struct sk_buff *skb)
3741 {
3742         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3743 }
3744
3745 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3746 {
3747         skb->dev = skb_dst(skb)->dev;
3748         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3749 }
3750
3751 static int ip6_pkt_prohibit(struct sk_buff *skb)
3752 {
3753         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3754 }
3755
3756 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3757 {
3758         skb->dev = skb_dst(skb)->dev;
3759         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3760 }
3761
3762 /*
3763  *      Allocate a dst for local (unicast / anycast) address.
3764  */
3765
3766 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3767                                      struct inet6_dev *idev,
3768                                      const struct in6_addr *addr,
3769                                      bool anycast, gfp_t gfp_flags)
3770 {
3771         struct fib6_config cfg = {
3772                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3773                 .fc_ifindex = idev->dev->ifindex,
3774                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3775                 .fc_dst = *addr,
3776                 .fc_dst_len = 128,
3777                 .fc_protocol = RTPROT_KERNEL,
3778                 .fc_nlinfo.nl_net = net,
3779                 .fc_ignore_dev_down = true,
3780         };
3781
3782         if (anycast) {
3783                 cfg.fc_type = RTN_ANYCAST;
3784                 cfg.fc_flags |= RTF_ANYCAST;
3785         } else {
3786                 cfg.fc_type = RTN_LOCAL;
3787                 cfg.fc_flags |= RTF_LOCAL;
3788         }
3789
3790         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3791 }
3792
3793 /* remove deleted ip from prefsrc entries */
/* Callback argument for fib6_remove_prefsrc(): the address being removed
 * and (optionally) the device it was configured on.
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;
	struct in6_addr *addr;	/* preferred source address to purge */
};
3799
3800 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3801 {
3802         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3803         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3804         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3805
3806         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3807             rt != net->ipv6.fib6_null_entry &&
3808             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3809                 spin_lock_bh(&rt6_exception_lock);
3810                 /* remove prefsrc entry */
3811                 rt->fib6_prefsrc.plen = 0;
3812                 spin_unlock_bh(&rt6_exception_lock);
3813         }
3814         return 0;
3815 }
3816
3817 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3818 {
3819         struct net *net = dev_net(ifp->idev->dev);
3820         struct arg_dev_net_ip adni = {
3821                 .dev = ifp->idev->dev,
3822                 .net = net,
3823                 .addr = &ifp->addr,
3824         };
3825         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3826 }
3827
3828 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT)
3829
3830 /* Remove routers and update dst entries when gateway turn into host. */
3831 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3832 {
3833         struct in6_addr *gateway = (struct in6_addr *)arg;
3834
3835         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3836             rt->fib6_nh.fib_nh_gw_family &&
3837             ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3838                 return -1;
3839         }
3840
3841         /* Further clean up cached routes in exception table.
3842          * This is needed because cached route may have a different
3843          * gateway than its 'parent' in the case of an ip redirect.
3844          */
3845         rt6_exceptions_clean_tohost(rt, gateway);
3846
3847         return 0;
3848 }
3849
3850 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3851 {
3852         fib6_clean_all(net, fib6_clean_tohost, gateway);
3853 }
3854
/* Callback argument for the netdev-event FIB walkers (fib6_ifup /
 * fib6_ifdown): the device plus either the nexthop flags to apply or
 * the raw notifier event, depending on the caller.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;	/* used by rt6_sync_up() */
		unsigned long event;	/* used by rt6_sync_down_dev() */
	};
};
3862
3863 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3864 {
3865         struct fib6_info *iter;
3866         struct fib6_node *fn;
3867
3868         fn = rcu_dereference_protected(rt->fib6_node,
3869                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3870         iter = rcu_dereference_protected(fn->leaf,
3871                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3872         while (iter) {
3873                 if (iter->fib6_metric == rt->fib6_metric &&
3874                     rt6_qualify_for_ecmp(iter))
3875                         return iter;
3876                 iter = rcu_dereference_protected(iter->fib6_next,
3877                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3878         }
3879
3880         return NULL;
3881 }
3882
3883 static bool rt6_is_dead(const struct fib6_info *rt)
3884 {
3885         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3886             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3887              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3888                 return true;
3889
3890         return false;
3891 }
3892
3893 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3894 {
3895         struct fib6_info *iter;
3896         int total = 0;
3897
3898         if (!rt6_is_dead(rt))
3899                 total += rt->fib6_nh.fib_nh_weight;
3900
3901         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3902                 if (!rt6_is_dead(iter))
3903                         total += iter->fib6_nh.fib_nh_weight;
3904         }
3905
3906         return total;
3907 }
3908
3909 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3910 {
3911         int upper_bound = -1;
3912
3913         if (!rt6_is_dead(rt)) {
3914                 *weight += rt->fib6_nh.fib_nh_weight;
3915                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3916                                                     total) - 1;
3917         }
3918         atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3919 }
3920
3921 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3922 {
3923         struct fib6_info *iter;
3924         int weight = 0;
3925
3926         rt6_upper_bound_set(rt, &weight, total);
3927
3928         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3929                 rt6_upper_bound_set(iter, &weight, total);
3930 }
3931
3932 void rt6_multipath_rebalance(struct fib6_info *rt)
3933 {
3934         struct fib6_info *first;
3935         int total;
3936
3937         /* In case the entire multipath route was marked for flushing,
3938          * then there is no need to rebalance upon the removal of every
3939          * sibling route.
3940          */
3941         if (!rt->fib6_nsiblings || rt->should_flush)
3942                 return;
3943
3944         /* During lookup routes are evaluated in order, so we need to
3945          * make sure upper bounds are assigned from the first sibling
3946          * onwards.
3947          */
3948         first = rt6_multipath_first_sibling(rt);
3949         if (WARN_ON_ONCE(!first))
3950                 return;
3951
3952         total = rt6_multipath_total_weight(first);
3953         rt6_multipath_upper_bound_set(first, total);
3954 }
3955
3956 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3957 {
3958         const struct arg_netdev_event *arg = p_arg;
3959         struct net *net = dev_net(arg->dev);
3960
3961         if (rt != net->ipv6.fib6_null_entry &&
3962             rt->fib6_nh.fib_nh_dev == arg->dev) {
3963                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3964                 fib6_update_sernum_upto_root(net, rt);
3965                 rt6_multipath_rebalance(rt);
3966         }
3967
3968         return 0;
3969 }
3970
3971 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3972 {
3973         struct arg_netdev_event arg = {
3974                 .dev = dev,
3975                 {
3976                         .nh_flags = nh_flags,
3977                 },
3978         };
3979
3980         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3981                 arg.nh_flags |= RTNH_F_LINKDOWN;
3982
3983         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3984 }
3985
3986 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3987                                    const struct net_device *dev)
3988 {
3989         struct fib6_info *iter;
3990
3991         if (rt->fib6_nh.fib_nh_dev == dev)
3992                 return true;
3993         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3994                 if (iter->fib6_nh.fib_nh_dev == dev)
3995                         return true;
3996
3997         return false;
3998 }
3999
4000 static void rt6_multipath_flush(struct fib6_info *rt)
4001 {
4002         struct fib6_info *iter;
4003
4004         rt->should_flush = 1;
4005         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4006                 iter->should_flush = 1;
4007 }
4008
4009 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4010                                              const struct net_device *down_dev)
4011 {
4012         struct fib6_info *iter;
4013         unsigned int dead = 0;
4014
4015         if (rt->fib6_nh.fib_nh_dev == down_dev ||
4016             rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4017                 dead++;
4018         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4019                 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4020                     iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4021                         dead++;
4022
4023         return dead;
4024 }
4025
4026 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4027                                        const struct net_device *dev,
4028                                        unsigned int nh_flags)
4029 {
4030         struct fib6_info *iter;
4031
4032         if (rt->fib6_nh.fib_nh_dev == dev)
4033                 rt->fib6_nh.fib_nh_flags |= nh_flags;
4034         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4035                 if (iter->fib6_nh.fib_nh_dev == dev)
4036                         iter->fib6_nh.fib_nh_flags |= nh_flags;
4037 }
4038
4039 /* called with write lock held for table with rt */
4040 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4041 {
4042         const struct arg_netdev_event *arg = p_arg;
4043         const struct net_device *dev = arg->dev;
4044         struct net *net = dev_net(dev);
4045
4046         if (rt == net->ipv6.fib6_null_entry)
4047                 return 0;
4048
4049         switch (arg->event) {
4050         case NETDEV_UNREGISTER:
4051                 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4052         case NETDEV_DOWN:
4053                 if (rt->should_flush)
4054                         return -1;
4055                 if (!rt->fib6_nsiblings)
4056                         return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4057                 if (rt6_multipath_uses_dev(rt, dev)) {
4058                         unsigned int count;
4059
4060                         count = rt6_multipath_dead_count(rt, dev);
4061                         if (rt->fib6_nsiblings + 1 == count) {
4062                                 rt6_multipath_flush(rt);
4063                                 return -1;
4064                         }
4065                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4066                                                    RTNH_F_LINKDOWN);
4067                         fib6_update_sernum(net, rt);
4068                         rt6_multipath_rebalance(rt);
4069                 }
4070                 return -2;
4071         case NETDEV_CHANGE:
4072                 if (rt->fib6_nh.fib_nh_dev != dev ||
4073                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4074                         break;
4075                 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4076                 rt6_multipath_rebalance(rt);
4077                 break;
4078         }
4079
4080         return 0;
4081 }
4082
4083 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4084 {
4085         struct arg_netdev_event arg = {
4086                 .dev = dev,
4087                 {
4088                         .event = event,
4089                 },
4090         };
4091         struct net *net = dev_net(dev);
4092
4093         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4094                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4095         else
4096                 fib6_clean_all(net, fib6_ifdown, &arg);
4097 }
4098
4099 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4100 {
4101         rt6_sync_down_dev(dev, event);
4102         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4103         neigh_ifdown(&nd_tbl, dev);
4104 }
4105
/* Callback argument for rt6_mtu_change_route(): the device whose MTU
 * changed and its new value.
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;	/* new device MTU */
};
4110
/* fib6_clean_all() callback: propagate a device MTU change to the
 * RTAX_MTU metric of routes on that device (unless the metric is
 * locked) and to their cached exception routes.  Always returns 0.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink to the new device MTU, or grow only if the route
		 * was tracking the old device MTU exactly
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* exception entries carry their own PMTU; update under lock */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4145
4146 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4147 {
4148         struct rt6_mtu_change_arg arg = {
4149                 .dev = dev,
4150                 .mtu = mtu,
4151         };
4152
4153         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4154 }
4155
/* Netlink attribute validation policy for IPv6 RTM_* route messages;
 * consumed by nlmsg_parse() in rtm_to_fib6_config() and friends.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
	[RTA_TABLE]             = { .type = NLA_U32 },
	[RTA_IP_PROTO]          = { .type = NLA_U8 },
	[RTA_SPORT]             = { .type = NLA_U16 },
	[RTA_DPORT]             = { .type = NLA_U16 },
};
4175
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink request into a
 * fib6_config.  On success *cfg is fully initialized; note that the
 * fc_mx, fc_mp and fc_encap members point at attribute data inside
 * @skb and are only valid for the lifetime of the message.
 *
 * Returns 0 on success or a negative errno; validation failures set an
 * extack message where one is available.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	/* all reject-style route types share RTF_REJECT; the exact
	 * behavior is recovered later from fc_type
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* only the rtm_dst_len prefix bits are meaningful; the
		 * attribute may carry a truncated address
		 */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* not copied: points into the request skb */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE, if present, overrides the 8-bit rtm_table field */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unrecognized preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* an infinite timeout simply leaves RTF_EXPIRES unset */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4306
/* One nexthop parsed from an RTA_MULTIPATH request.  Entries are queued
 * on a local list so all nexthops can be created and validated before
 * any of them is inserted into the FIB.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request */
	struct list_head next;		/* link in the caller's rt6_nh_list */
};
4312
4313 static int ip6_route_info_append(struct net *net,
4314                                  struct list_head *rt6_nh_list,
4315                                  struct fib6_info *rt,
4316                                  struct fib6_config *r_cfg)
4317 {
4318         struct rt6_nh *nh;
4319         int err = -EEXIST;
4320
4321         list_for_each_entry(nh, rt6_nh_list, next) {
4322                 /* check if fib6_info already exists */
4323                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4324                         return err;
4325         }
4326
4327         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4328         if (!nh)
4329                 return -ENOMEM;
4330         nh->fib6_info = rt;
4331         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4332         list_add_tail(&nh->next, rt6_nh_list);
4333
4334         return 0;
4335 }
4336
4337 static void ip6_route_mpath_notify(struct fib6_info *rt,
4338                                    struct fib6_info *rt_last,
4339                                    struct nl_info *info,
4340                                    __u16 nlflags)
4341 {
4342         /* if this is an APPEND route, then rt points to the first route
4343          * inserted and rt_last points to last route inserted. Userspace
4344          * wants a consistent dump of the route which starts at the first
4345          * nexthop. Since sibling routes are always added at the end of
4346          * the list, find the first sibling of the last route appended
4347          */
4348         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4349                 rt = list_first_entry(&rt_last->fib6_siblings,
4350                                       struct fib6_info,
4351                                       fib6_siblings);
4352         }
4353
4354         if (rt)
4355                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4356 }
4357
/* Install a multipath (ECMP) route described by an RTA_MULTIPATH
 * request.  All nexthops are first created and collected on a local
 * list; only then are they inserted one by one.  On partial failure the
 * routes already installed are deleted again, and a notification
 * covering exactly what remains installed is sent so userspace stays
 * coherent with the subsequent delete notifications.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config: top-level request with this
		 * nexthop's ifindex/gateway/encap overriding it
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops is weight - 1 on the wire */
		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		/* NOTE(review): this drops the list's reference; the
		 * rt_last/rt_notif pointers saved below appear to rely
		 * on the FIB holding its own reference after a
		 * successful insert — confirm against __ip6_ins_rt
		 */
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any routes never inserted and free the list entries */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4505
4506 static int ip6_route_multipath_del(struct fib6_config *cfg,
4507                                    struct netlink_ext_ack *extack)
4508 {
4509         struct fib6_config r_cfg;
4510         struct rtnexthop *rtnh;
4511         int remaining;
4512         int attrlen;
4513         int err = 1, last_err = 0;
4514
4515         remaining = cfg->fc_mp_len;
4516         rtnh = (struct rtnexthop *)cfg->fc_mp;
4517
4518         /* Parse a Multipath Entry */
4519         while (rtnh_ok(rtnh, remaining)) {
4520                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4521                 if (rtnh->rtnh_ifindex)
4522                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4523
4524                 attrlen = rtnh_attrlen(rtnh);
4525                 if (attrlen > 0) {
4526                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4527
4528                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4529                         if (nla) {
4530                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4531                                 r_cfg.fc_flags |= RTF_GATEWAY;
4532                         }
4533                 }
4534                 err = ip6_route_del(&r_cfg, extack);
4535                 if (err)
4536                         last_err = err;
4537
4538                 rtnh = rtnh_next(rtnh, &remaining);
4539         }
4540
4541         return last_err;
4542 }
4543
4544 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4545                               struct netlink_ext_ack *extack)
4546 {
4547         struct fib6_config cfg;
4548         int err;
4549
4550         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4551         if (err < 0)
4552                 return err;
4553
4554         if (cfg.fc_mp)
4555                 return ip6_route_multipath_del(&cfg, extack);
4556         else {
4557                 cfg.fc_delete_all_nh = 1;
4558                 return ip6_route_del(&cfg, extack);
4559         }
4560 }
4561
4562 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4563                               struct netlink_ext_ack *extack)
4564 {
4565         struct fib6_config cfg;
4566         int err;
4567
4568         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4569         if (err < 0)
4570                 return err;
4571
4572         if (cfg.fc_metric == 0)
4573                 cfg.fc_metric = IP6_RT_PRIO_USER;
4574
4575         if (cfg.fc_mp)
4576                 return ip6_route_multipath_add(&cfg, extack);
4577         else
4578                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4579 }
4580
4581 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4582 {
4583         int nexthop_len = 0;
4584
4585         if (rt->fib6_nsiblings) {
4586                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4587                             + NLA_ALIGN(sizeof(struct rtnexthop))
4588                             + nla_total_size(16) /* RTA_GATEWAY */
4589                             + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4590
4591                 nexthop_len *= rt->fib6_nsiblings;
4592         }
4593
4594         return NLMSG_ALIGN(sizeof(struct rtmsg))
4595                + nla_total_size(16) /* RTA_SRC */
4596                + nla_total_size(16) /* RTA_DST */
4597                + nla_total_size(16) /* RTA_GATEWAY */
4598                + nla_total_size(16) /* RTA_PREFSRC */
4599                + nla_total_size(4) /* RTA_TABLE */
4600                + nla_total_size(4) /* RTA_IIF */
4601                + nla_total_size(4) /* RTA_OIF */
4602                + nla_total_size(4) /* RTA_PRIORITY */
4603                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4604                + nla_total_size(sizeof(struct rta_cacheinfo))
4605                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4606                + nla_total_size(1) /* RTA_PREF */
4607                + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4608                + nexthop_len;
4609 }
4610
/* Build one route message of @type into @skb describing @rt and,
 * when @dst is given, the rt6_info clone it resolved to.  @dest/@src,
 * if non-NULL, are the exact lookup addresses and override the prefix
 * data; @iif, if non-zero, is reported as the input interface.
 *
 * Returns 0 on success or -EMSGSIZE if the message does not fit (the
 * partial message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* a cached clone (rt6) reports its own keys and flags, otherwise
	 * the fib entry's are used
	 */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* table ids >= 256 cannot fit the 8-bit rtm_table field; the
	 * full id is always carried in RTA_TABLE
	 */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit lookup destination is reported as a full /128 */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			/* multicast routes are resolved by the mroute
			 * code, which may complete the message itself
			 */
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4764
4765 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4766                                const struct net_device *dev)
4767 {
4768         if (f6i->fib6_nh.fib_nh_dev == dev)
4769                 return true;
4770
4771         if (f6i->fib6_nsiblings) {
4772                 struct fib6_info *sibling, *next_sibling;
4773
4774                 list_for_each_entry_safe(sibling, next_sibling,
4775                                          &f6i->fib6_siblings, fib6_siblings) {
4776                         if (sibling->fib6_nh.fib_nh_dev == dev)
4777                                 return true;
4778                 }
4779         }
4780
4781         return false;
4782 }
4783
4784 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4785 {
4786         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4787         struct fib_dump_filter *filter = &arg->filter;
4788         unsigned int flags = NLM_F_MULTI;
4789         struct net *net = arg->net;
4790
4791         if (rt == net->ipv6.fib6_null_entry)
4792                 return 0;
4793
4794         if ((filter->flags & RTM_F_PREFIX) &&
4795             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4796                 /* success since this is not a prefix route */
4797                 return 1;
4798         }
4799         if (filter->filter_set) {
4800                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4801                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4802                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4803                         return 1;
4804                 }
4805                 flags |= NLM_F_DUMP_FILTERED;
4806         }
4807
4808         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4809                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4810                              arg->cb->nlh->nlmsg_seq, flags);
4811 }
4812
/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 * Sockets that did not opt in to strict checking get the permissive
 * legacy parse; strict sockets additionally have every header field
 * and attribute validated against what a get-route request may carry.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* legacy path: parse without the strict checks below */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	/* src/dst lengths must be unset or a full /128 */
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	/* RTM_F_FIB_MATCH is the only flag a get request may carry */
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* an address attribute requires the matching prefix length */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* reject any attribute that a get request cannot use */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4879
/* RTM_GETROUTE handler: perform a route lookup for the flow described
 * by the request and unicast the result back to the requester.  With
 * RTM_F_FIB_MATCH the matched FIB entry is reported instead of the
 * resolved dst.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* addresses must be full 128-bit values */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	/* an input interface selects the forwarding (input) lookup path,
	 * otherwise an output lookup is done
	 */
	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* hands the rt reference to skb; freed with the skb */
	skb_dst_set(skb, &rt->dst);

	/* rt->from is rcu-protected; hold the read lock across the fill */
	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
5019
5020 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5021                      unsigned int nlm_flags)
5022 {
5023         struct sk_buff *skb;
5024         struct net *net = info->nl_net;
5025         u32 seq;
5026         int err;
5027
5028         err = -ENOBUFS;
5029         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5030
5031         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5032         if (!skb)
5033                 goto errout;
5034
5035         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5036                             event, info->portid, seq, nlm_flags);
5037         if (err < 0) {
5038                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5039                 WARN_ON(err == -EMSGSIZE);
5040                 kfree_skb(skb);
5041                 goto errout;
5042         }
5043         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5044                     info->nlh, gfp_any());
5045         return;
5046 errout:
5047         if (err < 0)
5048                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5049 }
5050
5051 static int ip6_route_dev_notify(struct notifier_block *this,
5052                                 unsigned long event, void *ptr)
5053 {
5054         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5055         struct net *net = dev_net(dev);
5056
5057         if (!(dev->flags & IFF_LOOPBACK))
5058                 return NOTIFY_OK;
5059
5060         if (event == NETDEV_REGISTER) {
5061                 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5062                 net->ipv6.ip6_null_entry->dst.dev = dev;
5063                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5064 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5065                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5066                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5067                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5068                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5069 #endif
5070          } else if (event == NETDEV_UNREGISTER &&
5071                     dev->reg_state != NETREG_UNREGISTERED) {
5072                 /* NETDEV_UNREGISTER could be fired for multiple times by
5073                  * netdev_wait_allrefs(). Make sure we only call this once.
5074                  */
5075                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5076 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5077                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5078                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5079 #endif
5080         }
5081
5082         return NOTIFY_OK;
5083 }
5084
5085 /*
5086  *      /proc
5087  */
5088
#ifdef CONFIG_PROC_FS
/* seq_file show callback for /proc/net/rt6_stats: prints seven
 * hex-formatted counters on one line (fib nodes, route nodes, allocated
 * rt entries, route entries, cached routes, current dst entry count,
 * discarded routes).  Field order is part of the procfs ABI.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
        struct net *net = (struct net *)seq->private;
        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
                   net->ipv6.rt6_stats->fib_nodes,
                   net->ipv6.rt6_stats->fib_route_nodes,
                   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
                   net->ipv6.rt6_stats->fib_rt_entries,
                   net->ipv6.rt6_stats->fib_rt_cache,
                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
                   net->ipv6.rt6_stats->fib_discarded_routes);

        return 0;
}
#endif  /* CONFIG_PROC_FS */
5105
5106 #ifdef CONFIG_SYSCTL
5107
/* sysctl handler for net.ipv6.route.flush: writing a value triggers an
 * immediate fib6 garbage-collection run.  Reading is rejected (the knob
 * is write-only, mode 0200 in the table below).
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
                              void __user *buffer, size_t *lenp, loff_t *ppos)
{
        struct net *net;
        int delay;
        int ret;
        if (!write)
                return -EINVAL;

        net = (struct net *)ctl->extra1;
        /* NOTE(review): delay is sampled *before* proc_dointvec() stores the
         * newly written value, so the GC run below uses the previously
         * configured flush_delay — confirm this is intended.
         */
        delay = net->ipv6.sysctl.flush_delay;
        ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
        if (ret)
                return ret;

        /* delay <= 0: run GC with no expiry delay. */
        fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
        return 0;
}
5127
/* Min/max bounds referenced via .extra1/.extra2 in the sysctl table below. */
static int zero;
static int one = 1;
5130
/* Template for the per-namespace net.ipv6.route.* sysctl table.  The
 * .data pointers here refer to init_net (or the global dst_ops template)
 * and are re-pointed at each namespace's own fields, by index, in
 * ipv6_route_sysctl_init() — so the entry ORDER is significant.
 */
static struct ctl_table ipv6_route_table_template[] = {
        {
                /* Write-only trigger handled by ipv6_sysctl_rtcache_flush(). */
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* Millisecond view of the same field as "gc_min_interval". */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        {
                /* Boolean; clamped to [0, 1] via extra1/extra2. */
                .procname       =       "skip_notify_on_dev_down",
                .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
                .extra1         =       &zero,
                .extra2         =       &one,
        },
        { }
};
5213
/* Clone the sysctl template for namespace @net and point each entry's
 * .data at the namespace's own fields.  The indices below must match the
 * entry order in ipv6_route_table_template[].  Returns the new table, or
 * NULL on allocation failure (caller handles NULL).  Ownership of the
 * returned table passes to the caller.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
        struct ctl_table *table;

        table = kmemdup(ipv6_route_table_template,
                        sizeof(ipv6_route_table_template),
                        GFP_KERNEL);

        if (table) {
                table[0].data = &net->ipv6.sysctl.flush_delay;
                /* "flush" handler recovers the netns via ctl->extra1. */
                table[0].extra1 = net;
                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                /* gc_min_interval_ms shares storage with gc_min_interval. */
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
                table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        table[0].procname = NULL;
        }

        return table;
}
5243 #endif
5244
/* Per-namespace setup for IPv6 routing: clone the dst_ops template,
 * allocate the always-present special route entries (fib6 null, rt6 null,
 * and with CONFIG_IPV6_MULTIPLE_TABLES also prohibit and blackhole), and
 * seed the GC/sysctl defaults.  Returns 0 or -ENOMEM; on failure the goto
 * ladder at the bottom unwinds exactly what was set up, in reverse order.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
        int ret = -ENOMEM;

        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                goto out_ip6_dst_ops;

        net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
                                            sizeof(*net->ipv6.fib6_null_entry),
                                            GFP_KERNEL);
        if (!net->ipv6.fib6_null_entry)
                goto out_ip6_dst_entries;

        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_fib6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        net->ipv6.fib6_has_custom_rules = false;
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);

        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
#endif

        /* Default values for the net.ipv6.route.* sysctls. */
        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = 4096;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
        net->ipv6.sysctl.skip_notify_on_dev_down = 0;

        net->ipv6.ip6_rt_gc_expire = 30*HZ;

        ret = 0;
out:
        return ret;

        /* Error unwinding: reverse order of the allocations above. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
        kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
        goto out;
}
5320
5321 static void __net_exit ip6_route_net_exit(struct net *net)
5322 {
5323         kfree(net->ipv6.fib6_null_entry);
5324         kfree(net->ipv6.ip6_null_entry);
5325 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5326         kfree(net->ipv6.ip6_prohibit_entry);
5327         kfree(net->ipv6.ip6_blk_hole_entry);
5328 #endif
5329         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5330 }
5331
5332 static int __net_init ip6_route_net_init_late(struct net *net)
5333 {
5334 #ifdef CONFIG_PROC_FS
5335         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5336                         sizeof(struct ipv6_route_iter));
5337         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5338                         rt6_stats_seq_show, NULL);
5339 #endif
5340         return 0;
5341 }
5342
5343 static void __net_exit ip6_route_net_exit_late(struct net *net)
5344 {
5345 #ifdef CONFIG_PROC_FS
5346         remove_proc_entry("ipv6_route", net->proc_net);
5347         remove_proc_entry("rt6_stats", net->proc_net);
5348 #endif
5349 }
5350
/* Core per-namespace init/exit for the IPv6 routing state; the /proc
 * entries are handled separately by ip6_route_net_late_ops.
 */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
5355
5356 static int __net_init ipv6_inetpeer_init(struct net *net)
5357 {
5358         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5359
5360         if (!bp)
5361                 return -ENOMEM;
5362         inet_peer_base_init(bp);
5363         net->ipv6.peers = bp;
5364         return 0;
5365 }
5366
5367 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5368 {
5369         struct inet_peer_base *bp = net->ipv6.peers;
5370
5371         net->ipv6.peers = NULL;
5372         inetpeer_invalidate_tree(bp);
5373         kfree(bp);
5374 }
5375
/* Per-namespace lifetime of the IPv6 inetpeer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
        .init   =       ipv6_inetpeer_init,
        .exit   =       ipv6_inetpeer_exit,
};

/* Registered after the main routing state so the /proc entries are
 * created last and removed first.
 */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};

/* Device notifier; priority is set relative to addrconf's notifier
 * (ADDRCONF_NOTIFY_PRIORITY) to order the two callbacks.
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5390
/* Boot-time fixup for init_net only: the loopback device is registered
 * before this code runs, so the special route entries never observed a
 * NETDEV_REGISTER event (see ip6_route_dev_notify()) and must be bound
 * to init_net's loopback device by hand here.
 */
void __init ip6_route_init_special_entries(void)
{
        init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5406
/* Module-wide IPv6 routing init.  Order matters: slab cache, blackhole
 * dst accounting, pernet subsystems, fib6/xfrm6/fib6-rules, the late
 * (/proc) pernet ops, rtnetlink handlers, the device notifier, and
 * finally the per-CPU uncached-route lists.  On any failure the goto
 * ladder unwinds everything registered so far, in reverse order.
 * Returns 0 on success or a negative errno.
 */
int __init ip6_route_init(void)
{
        int ret;
        int cpu;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
        if (ret)
                goto out_dst_entries;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_register_inetpeer;

        /* Blackhole dsts share the same slab cache as regular rt6_info. */
        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        /* rtnetlink route handlers; a failure of any of them triggers
         * rtnl_unregister_all() on the error path, so one label suffices.
         */
        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
                                   inet6_rtm_newroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
                                   inet6_rtm_delroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
                                   inet6_rtm_getroute, NULL,
                                   RTNL_FLAG_DOIT_UNLOCKED);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }

out:
        return ret;

        /* Error unwinding: reverse order of the setup above. */
out_register_late_subsys:
        rtnl_unregister_all(PF_INET6);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}
5498
/* Module-wide teardown: undoes ip6_route_init() in exact reverse order
 * of its setup sequence.
 */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}