/*
 * Source: net/ipv6/route.c from the Linux kernel tree, captured from a
 * gitweb export (blob efecdcff5055af9a252b466a166d548a0ee0ce3a).
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Verdict of a next-hop reachability (NUD) check, produced by
 * rt6_check_neigh() and consumed by rt6_score_route()/find_match().
 * Negative values are failures; FAIL_DO_RR additionally asks the
 * caller to round-robin to the next candidate router.
 */
74 enum rt6_nud_state {
75         RT6_NUD_FAIL_HARD = -3,
76         RT6_NUD_FAIL_PROBE = -2,
77         RT6_NUD_FAIL_DO_RR = -1,
78         RT6_NUD_SUCCEED = 1
79 };
80
/* Forward declarations for dst_ops callbacks and helpers defined below. */
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-CPU list of "uncached" rt6_info entries (routes added via
 * rt6_uncached_list_add() rather than living in the FIB tree); walked
 * by rt6_uncached_list_flush_dev() on device teardown.
 */
124 struct uncached_list {
125         spinlock_t              lock;   /* protects 'head' */
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
/* Retarget every uncached route that still references @dev so the
 * device can be unregistered: its inet6_dev and dst.dev pointers are
 * moved to the netns loopback device (with matching refcount
 * transfers). No-op when @dev is the loopback device itself.
 */
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
                        /* Swap the idev reference to loopback's idev. */
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
                        /* Hold the new device before dropping the old one. */
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
/* A per-CPU (RTF_PCPU) clone shares metrics with its parent route:
 * copy-on-write goes through the parent's dst (rt->dst.from).
 */
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(rt->dst.from);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
/* dst_ops->confirm_neigh callback: mark the next-hop neighbour entry
 * as recently confirmed. Skipped when there is no usable address, the
 * device does no neighbour resolution (NOARP/loopback), or the target
 * is multicast (no unicast neighbour state to confirm).
 */
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
/* dst_ops template for ordinary IPv6 routes; each netns gets its own
 * copy (net->ipv6.ip6_dst_ops). gc_thresh is the default GC trigger.
 */
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/* Blackhole dsts deliberately ignore PMTU updates and redirects;
 * these two callbacks are intentionally empty.
 */
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278                                       struct sk_buff *skb)
279 {
280 }
281
/* dst_ops for blackhole copies of routes (no gc, no ifdown/link-failure
 * handling; shares lookup/check/mtu helpers with the regular ops).
 */
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
/* Default metrics for the template routes below (hop limit 0 means
 * "use the sysctl default").
 */
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
/* Catch-all "null" route: rejects with -ENETUNREACH and discards
 * packets. rt6i_metric ~0 makes it the lowest-priority match; the
 * static refcounts keep it from ever being freed.
 */
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* "prohibit" policy action: reject with -EACCES. */
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325         .rt6i_protocol  = RTPROT_KERNEL,
326         .rt6i_metric    = ~(u32) 0,
327         .rt6i_ref       = ATOMIC_INIT(1),
328 };
329
/* "blackhole" policy action: silently discard with -EINVAL. */
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340         .rt6i_protocol  = RTPROT_KERNEL,
341         .rt6i_metric    = ~(u32) 0,
342         .rt6i_ref       = ATOMIC_INIT(1),
343 };
344
345 #endif
346
/* Zero the rt6_info-specific tail of a freshly allocated route (the
 * embedded dst_entry was already set up by dst_alloc()) and initialise
 * its list heads. The memset relies on the dst being the first member.
 */
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
/* Returns a refcounted rt6_info (initial ref 1) or NULL on allocation
 * failure; bumps the per-netns fib_rt_alloc statistic on success.
 */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
/* dst_ops->destroy callback: tear down an rt6_info. Frees metrics, the
 * per-CPU clone array and the exception bucket, unlinks from the
 * uncached list, and drops the idev and parent-dst references.
 */
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct dst_entry *from = dst->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
        /* Last reference is gone, so plain access is safe (hence the
         * rcu_dereference_protected(..., 1)).
         */
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
412         dst->from = NULL;
413         dst_release(from);
414 }
415
/* dst_ops->ifdown callback: when @dev goes away, repoint the route's
 * inet6_dev reference at the netns loopback device.
 */
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
/* Like __rt6_check_expired(), but a cached clone (dst.from set) is
 * also considered expired when its dst has been obsoleted or its
 * parent route has itself expired (checked recursively).
 */
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->dst.from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                        rt6_check_expired((struct rt6_info *)rt->dst.from);
449         }
450         return false;
451 }
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458         int route_choosen;
459
460         /* We might have already computed the hash for ICMPv6 errors. In such
461          * case it will always be non-zero. Otherwise now is the time to do it.
462          */
463         if (!fl6->mp_hash)
464                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465
466         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467         /* Don't change the route, if route_choosen == 0
468          * (siblings does not include ourself)
469          */
470         if (route_choosen)
471                 list_for_each_entry_safe(sibling, next_sibling,
472                                 &match->rt6i_siblings, rt6i_siblings) {
473                         route_choosen--;
474                         if (route_choosen == 0) {
475                                 if (rt6_score_route(sibling, oif, strict) < 0)
476                                         break;
477                                 match = sibling;
478                                 break;
479                         }
480                 }
481         return match;
482 }
483
484 /*
485  *      Route lookup. rcu_read_lock() should be held.
486  */
487
/* Walk the route list starting at @rt looking for one usable with the
 * requested output interface @oif (or, with no oif, one whose device
 * owns @saddr). Loopback routes bound to a different interface are
 * remembered as a fallback ("local"). With RT6_LOOKUP_F_IFACE and no
 * match, the null entry is returned instead of @rt.
 */
488 static inline struct rt6_info *rt6_device_match(struct net *net,
489                                                     struct rt6_info *rt,
490                                                     const struct in6_addr *saddr,
491                                                     int oif,
492                                                     int flags)
493 {
494         struct rt6_info *local = NULL;
495         struct rt6_info *sprt;
496
        /* Nothing to constrain on: any route will do. */
497         if (!oif && ipv6_addr_any(saddr))
498                 goto out;
499
500         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
501                 struct net_device *dev = sprt->dst.dev;
502
503                 if (oif) {
504                         if (dev->ifindex == oif)
505                                 return sprt;
506                         if (dev->flags & IFF_LOOPBACK) {
507                                 if (!sprt->rt6i_idev ||
508                                     sprt->rt6i_idev->dev->ifindex != oif) {
509                                         if (flags & RT6_LOOKUP_F_IFACE)
510                                                 continue;
                                        /* Prefer a local route already
                                         * bound to @oif.
                                         */
511                                         if (local &&
512                                             local->rt6i_idev->dev->ifindex == oif)
513                                                 continue;
514                                 }
515                                 local = sprt;
516                         }
517                 } else {
518                         if (ipv6_chk_addr(net, saddr, dev,
519                                           flags & RT6_LOOKUP_F_IFACE))
520                                 return sprt;
521                 }
522         }
523
524         if (oif) {
525                 if (local)
526                         return local;
527
528                 if (flags & RT6_LOOKUP_F_IFACE)
529                         return net->ipv6.ip6_null_entry;
530         }
531 out:
532         return rt;
533 }
534
535 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for a router reachability probe: target router
 * address plus a held reference on the outgoing device.
 */
536 struct __rt6_probe_work {
537         struct work_struct work;
538         struct in6_addr target;
539         struct net_device *dev;
540 };
541
/* Workqueue handler: send a Neighbor Solicitation to the target's
 * solicited-node multicast address, then drop the device ref and free
 * the work item.
 */
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544         struct in6_addr mcaddr;
545         struct __rt6_probe_work *work =
546                 container_of(w, struct __rt6_probe_work, work);
547
548         addrconf_addr_solict_mult(&work->target, &mcaddr);
549         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550         dev_put(work->dev);
551         kfree(work);
552 }
553
554 static void rt6_probe(struct rt6_info *rt)
555 {
556         struct __rt6_probe_work *work;
557         struct neighbour *neigh;
558         /*
559          * Okay, this does not seem to be appropriate
560          * for now, however, we need to check if it
561          * is really so; aka Router Reachability Probing.
562          *
563          * Router Reachability Probe MUST be rate-limited
564          * to no more than one per minute.
565          */
        /* Only gateway routes have a router to probe. */
566         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567                 return;
568         rcu_read_lock_bh();
569         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570         if (neigh) {
571                 if (neigh->nud_state & NUD_VALID)
572                         goto out;
573
574                 work = NULL;
575                 write_lock(&neigh->lock);
                /* Re-check state under the lock and rate-limit by
                 * rtr_probe_interval since the last neigh update.
                 */
576                 if (!(neigh->nud_state & NUD_VALID) &&
577                     time_after(jiffies,
578                                neigh->updated +
579                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else {
586                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
587         }
588
589         if (work) {
590                 INIT_WORK(&work->work, rt6_probe_deferred);
591                 work->target = rt->rt6i_gateway;
                /* Reference released by rt6_probe_deferred(). */
592                 dev_hold(rt->dst.dev);
593                 work->dev = rt->dst.dev;
594                 schedule_work(&work->work);
595         }
596
597 out:
598         rcu_read_unlock_bh();
599 }
600 #else
/* Without CONFIG_IPV6_ROUTER_PREF, router probing is compiled out. */
601 static inline void rt6_probe(struct rt6_info *rt)
602 {
603 }
604 #endif
605
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611         struct net_device *dev = rt->dst.dev;
612         if (!oif || dev->ifindex == oif)
613                 return 2;
614         if ((dev->flags & IFF_LOOPBACK) &&
615             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616                 return 1;
617         return 0;
618 }
619
/* Classify next-hop reachability for scoring. Routes with no gateway
 * (or RTF_NONEXTHOP) trivially succeed. With ROUTER_PREF enabled, any
 * state except NUD_FAILED is accepted; a FAILED neighbour yields
 * FAIL_PROBE. Without ROUTER_PREF a missing neighbour triggers
 * round-robin (FAIL_DO_RR).
 */
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
621 {
622         struct neighbour *neigh;
623         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
624
625         if (rt->rt6i_flags & RTF_NONEXTHOP ||
626             !(rt->rt6i_flags & RTF_GATEWAY))
627                 return RT6_NUD_SUCCEED;
628
629         rcu_read_lock_bh();
630         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631         if (neigh) {
632                 read_lock(&neigh->lock);
633                 if (neigh->nud_state & NUD_VALID)
634                         ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636                 else if (!(neigh->nud_state & NUD_FAILED))
637                         ret = RT6_NUD_SUCCEED;
638                 else
639                         ret = RT6_NUD_FAIL_PROBE;
640 #endif
641                 read_unlock(&neigh->lock);
642         } else {
643                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
645         }
646         rcu_read_unlock_bh();
647
648         return ret;
649 }
650
/* Compute a route's selection score: device match (see rt6_check_dev)
 * in the low bits, router preference above it, with a negative
 * rt6_nud_state passed through when F_REACHABLE is requested and the
 * neighbour check fails.
 */
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652                            int strict)
653 {
654         int m;
655
656         m = rt6_check_dev(rt, oif);
657         if (!m && (strict & RT6_LOOKUP_F_IFACE))
658                 return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662         if (strict & RT6_LOOKUP_F_REACHABLE) {
663                 int n = rt6_check_neigh(rt);
664                 if (n < 0)
665                         return n;
666         }
667         return m;
668 }
669
/* Score candidate @rt and return the better of it and @match, tracking
 * the best score in *mpri. Sets *do_rr when the candidate asked for
 * round-robin. Expired or link-down routes are skipped; a probe is
 * kicked off for reachability-strict lookups.
 */
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671                                    int *mpri, struct rt6_info *match,
672                                    bool *do_rr)
673 {
674         int m;
675         bool match_do_rr = false;
676         struct inet6_dev *idev = rt->rt6i_idev;
677         struct net_device *dev = rt->dst.dev;
678
679         if (dev && !netif_carrier_ok(dev) &&
680             idev->cnf.ignore_routes_with_linkdown &&
681             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
682                 goto out;
683
684         if (rt6_check_expired(rt))
685                 goto out;
686
687         m = rt6_score_route(rt, oif, strict);
688         if (m == RT6_NUD_FAIL_DO_RR) {
689                 match_do_rr = true;
690                 m = 0; /* lowest valid score */
691         } else if (m == RT6_NUD_FAIL_HARD) {
692                 goto out;
693         }
694
695         if (strict & RT6_LOOKUP_F_REACHABLE)
696                 rt6_probe(rt);
697
698         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
699         if (m > *mpri) {
700                 *do_rr = match_do_rr;
701                 *mpri = m;
702                 match = rt;
703         }
704 out:
705         return match;
706 }
707
/* Find the best route among entries sharing @metric: scan from the
 * round-robin head @rr_head to the end of the metric run, then wrap
 * from @leaf back up to @rr_head. If nothing matched, continue into the
 * remaining (different-metric) routes saved in 'cont'.
 */
708 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
709                                      struct rt6_info *leaf,
710                                      struct rt6_info *rr_head,
711                                      u32 metric, int oif, int strict,
712                                      bool *do_rr)
713 {
714         struct rt6_info *rt, *match, *cont;
715         int mpri = -1;
716
717         match = NULL;
718         cont = NULL;
719         for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
720                 if (rt->rt6i_metric != metric) {
721                         cont = rt;
722                         break;
723                 }
724
725                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
726         }
727
728         for (rt = leaf; rt && rt != rr_head;
729              rt = rcu_dereference(rt->dst.rt6_next)) {
730                 if (rt->rt6i_metric != metric) {
731                         cont = rt;
732                         break;
733                 }
734
735                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
736         }
737
738         if (match || !cont)
739                 return match;
740
741         for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
742                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
743
744         return match;
745 }
746
/* Select the best route at fib6 node @fn for (@oif, @strict), falling
 * back to the null entry when the node has no usable leaf. Implements
 * round-robin among equal-metric routes by advancing fn->rr_ptr (under
 * the table lock) when a candidate requested it. Caller holds
 * rcu_read_lock().
 */
747 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
748                                    int oif, int strict)
749 {
750         struct rt6_info *leaf = rcu_dereference(fn->leaf);
751         struct rt6_info *match, *rt0;
752         bool do_rr = false;
753         int key_plen;
754
755         if (!leaf)
756                 return net->ipv6.ip6_null_entry;
757
758         rt0 = rcu_dereference(fn->rr_ptr);
759         if (!rt0)
760                 rt0 = leaf;
761
762         /* Double check to make sure fn is not an intermediate node
763          * and fn->leaf does not points to its child's leaf
764          * (This might happen if all routes under fn are deleted from
765          * the tree and fib6_repair_tree() is called on the node.)
766          */
767         key_plen = rt0->rt6i_dst.plen;
768 #ifdef CONFIG_IPV6_SUBTREES
769         if (rt0->rt6i_src.plen)
770                 key_plen = rt0->rt6i_src.plen;
771 #endif
772         if (fn->fn_bit != key_plen)
773                 return net->ipv6.ip6_null_entry;
774
775         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
776                              &do_rr);
777
778         if (do_rr) {
779                 struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
780
781                 /* no entries matched; do round-robin */
782                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
783                         next = leaf;
784
785                 if (next != rt0) {
786                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
787                         /* make sure next is not being deleted from the tree */
788                         if (next->rt6i_node)
789                                 rcu_assign_pointer(fn->rr_ptr, next);
790                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
791                 }
792         }
793
794         return match ? match : net->ipv6.ip6_null_entry;
795 }
796
797 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
798 {
799         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
800 }
801
802 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a received Route Information option (RFC 4191) from a Router
 * Advertisement on @dev announced by @gwaddr. Validates the option
 * length against prefix_len, then adds, refreshes, or (on zero
 * lifetime) deletes the corresponding route. Returns 0 on success or
 * -EINVAL on a malformed option.
 */
803 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
804                   const struct in6_addr *gwaddr)
805 {
806         struct net *net = dev_net(dev);
807         struct route_info *rinfo = (struct route_info *) opt;
808         struct in6_addr prefix_buf, *prefix;
809         unsigned int pref;
810         unsigned long lifetime;
811         struct rt6_info *rt;
812
813         if (len < sizeof(struct route_info)) {
814                 return -EINVAL;
815         }
816
817         /* Sanity check for prefix_len and length */
        /* 'length' is in units of 8 octets: 1 = header only, 2 = up to
         * 64 prefix bits, 3 = full 128-bit prefix.
         */
818         if (rinfo->length > 3) {
819                 return -EINVAL;
820         } else if (rinfo->prefix_len > 128) {
821                 return -EINVAL;
822         } else if (rinfo->prefix_len > 64) {
823                 if (rinfo->length < 2) {
824                         return -EINVAL;
825                 }
826         } else if (rinfo->prefix_len > 0) {
827                 if (rinfo->length < 1) {
828                         return -EINVAL;
829                 }
830         }
831
832         pref = rinfo->route_pref;
833         if (pref == ICMPV6_ROUTER_PREF_INVALID)
834                 return -EINVAL;
835
836         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
837
838         if (rinfo->length == 3)
839                 prefix = (struct in6_addr *)rinfo->prefix;
840         else {
841                 /* this function is safe */
                /* Option may carry fewer than 16 prefix octets; copy
                 * only prefix_len bits into a zeroed buffer.
                 */
842                 ipv6_addr_prefix(&prefix_buf,
843                                  (struct in6_addr *)rinfo->prefix,
844                                  rinfo->prefix_len);
845                 prefix = &prefix_buf;
846         }
847
848         if (rinfo->prefix_len == 0)
849                 rt = rt6_get_dflt_router(gwaddr, dev);
850         else
851                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
852                                         gwaddr, dev);
853
        /* Zero lifetime means the router withdraws this route. */
854         if (rt && !lifetime) {
855                 ip6_del_rt(rt);
856                 rt = NULL;
857         }
858
859         if (!rt && lifetime)
860                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
861                                         dev, pref);
862         else if (rt)
863                 rt->rt6i_flags = RTF_ROUTEINFO |
864                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
865
866         if (rt) {
867                 if (!addrconf_finite_timeout(lifetime))
868                         rt6_clean_expires(rt);
869                 else
870                         rt6_set_expires(rt, jiffies + HZ * lifetime);
871
872                 ip6_rt_put(rt);
873         }
874         return 0;
875 }
876 #endif
877
/* Walk back up the fib6 tree from @fn until a node carrying route info
 * (RTN_RTINFO) is found, descending into a parent's source-address
 * subtree (FIB6_SUBTREE) when one exists and @fn did not come from it.
 * Returns NULL once the tree root (RTN_TL_ROOT) is reached.
 * Caller must hold rcu_read_lock() (fn->parent is rcu_dereference()d).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
895
896 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
897                           bool null_fallback)
898 {
899         struct rt6_info *rt = *prt;
900
901         if (dst_hold_safe(&rt->dst))
902                 return true;
903         if (null_fallback) {
904                 rt = net->ipv6.ip6_null_entry;
905                 dst_hold(&rt->dst);
906         } else {
907                 rt = NULL;
908         }
909         *prt = rt;
910         return false;
911 }
912
/* Flow-based route lookup in a single fib6 table, entirely under
 * rcu_read_lock(): walk the tree for (daddr, saddr), match on device
 * and flags, backtrack towards the root while only the null entry
 * matches, prefer a cached exception route when one exists, then take
 * a reference on the result (falling back to the held null entry if
 * the dst was already being released).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* Spread flows over siblings only when no oif was given */
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
953
954 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
955                                     int flags)
956 {
957         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
958 }
959 EXPORT_SYMBOL_GPL(ip6_route_lookup);
960
961 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
962                             const struct in6_addr *saddr, int oif, int strict)
963 {
964         struct flowi6 fl6 = {
965                 .flowi6_oif = oif,
966                 .daddr = *daddr,
967         };
968         struct dst_entry *dst;
969         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
970
971         if (saddr) {
972                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
973                 flags |= RT6_LOOKUP_F_HAS_SADDR;
974         }
975
976         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
977         if (dst->error == 0)
978                 return (struct rt6_info *) dst;
979
980         dst_release(dst);
981
982         return NULL;
983 }
984 EXPORT_SYMBOL(rt6_lookup);
985
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * The caller must hold a dst reference before calling it.
 */
991
992 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
993                         struct mx6_config *mxc,
994                         struct netlink_ext_ack *extack)
995 {
996         int err;
997         struct fib6_table *table;
998
999         table = rt->rt6i_table;
1000         spin_lock_bh(&table->tb6_lock);
1001         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1002         spin_unlock_bh(&table->tb6_lock);
1003
1004         return err;
1005 }
1006
1007 int ip6_ins_rt(struct rt6_info *rt)
1008 {
1009         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1010         struct mx6_config mxc = { .mx = NULL, };
1011
1012         /* Hold dst to account for the reference from the fib6 tree */
1013         dst_hold(&rt->dst);
1014         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1015 }
1016
/* called with rcu_lock held
 *
 * Pick the device a clone of @rt should use.  For non-local routes
 * this is simply rt->dst.dev.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
1039
/* Allocate an RTF_CACHE clone of @ort as a /128 host route towards
 * @daddr.  If @ort is itself a cache or pcpu copy, the clone is taken
 * from its origin (ort->dst.from) instead.  Returns NULL on allocation
 * failure.  The returned route carries the reference taken by
 * __ip6_dst_alloc().
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* The clone is always a host route to the given destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* daddr matching a non-/128 route's own address means the
		 * destination is an anycast address
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1082
1083 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1084 {
1085         struct net_device *dev;
1086         struct rt6_info *pcpu_rt;
1087
1088         rcu_read_lock();
1089         dev = ip6_rt_get_dev_rcu(rt);
1090         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1091         rcu_read_unlock();
1092         if (!pcpu_rt)
1093                 return NULL;
1094         ip6_rt_copy_init(pcpu_rt, rt);
1095         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1096         pcpu_rt->rt6i_flags |= RTF_PCPU;
1097         return pcpu_rt;
1098 }
1099
/* It should be called with rcu_read_lock() acquired
 *
 * Return this CPU's cached copy of @rt with a reference taken, or NULL
 * if no copy exists yet or its dst could not be safely held.
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* ip6_hold_safe() with NULL net clears pcpu_rt on failure */
	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
1113
/* Create this CPU's copy of @rt and install it in rt->rt6i_pcpu.
 * Returns the new (referenced) copy, or the held null entry when
 * allocation fails.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* The slot must still be empty: the lookup on this CPU found
	 * none before we got here (caller runs with bh disabled).
	 */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1134
1135 /* exception hash table implementation
1136  */
1137 static DEFINE_SPINLOCK(rt6_exception_lock);
1138
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 *
 * Unlinks the entry under RCU, drops the tree reference on its route
 * and frees the entry after a grace period; updates the bucket depth
 * and the per-netns cache counter.
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	/* Fetch net before rt6_release() can drop the route */
	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1159
1160 /* Remove oldest rt6_ex in bucket and free the memory
1161  * Caller must hold rt6_exception_lock
1162  */
1163 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1164 {
1165         struct rt6_exception *rt6_ex, *oldest = NULL;
1166
1167         if (!bucket)
1168                 return;
1169
1170         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1171                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1172                         oldest = rt6_ex;
1173         }
1174         rt6_remove_exception(bucket, oldest);
1175 }
1176
/* Hash (dst, src) to a bucket index in [0, 2^FIB6_EXCEPTION_BUCKET_SIZE_SHIFT).
 * The jhash seed is drawn once, lazily, so the hash is unpredictable
 * across boots; @src only contributes with IPV6_SUBTREES enabled.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1192
/* Helper function to find the cached exception entry in the hash table
 * and update the bucket pointer to point to the bucket for this
 * (daddr, saddr) pair.
 * Returns NULL when *bucket or daddr is NULL or no entry matches.
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* Advance *bucket to the bucket selected by the hash; the caller
	 * sees the updated pointer.
	 */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1225
/* Helper function to find the cached exception entry in the hash table
 * and update the bucket pointer to point to the bucket for this
 * (daddr, saddr) pair.
 * RCU-safe counterpart of __rt6_find_exception_spinlock().
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1260
/* Insert cache route @nrt into the exception table of its origin route
 * @ort, replacing any existing entry for the same (daddr, saddr) pair.
 *
 * Returns 0 on success; -EINVAL when ort's bucket list was already
 * flushed (ort is being removed) or when nrt's pmtu is not below ort's
 * mtu; -ENOMEM on allocation failure.  On success the fib6 node's
 * sernum is bumped to invalidate cached dsts and gc is kicked.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* First exception on ort: allocate the bucket array */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* Replace any stale entry for the same address pair */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Cap the bucket depth by evicting the least recently used entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		fib6_update_sernum(ort);
		fib6_force_start_gc(net);
	}

	return err;
}
1350
/* Remove every exception entry of @rt and mark the route so that no new
 * bucket list can be created for it (used when rt is being removed).
 */
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* Every entry of this bucket must be gone now */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1377
/* Find cached rt in the hash table inside passed in rt
 * Returns NULL when there is no matching entry or the matching entry
 * has expired.
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1409
/* Remove the passed in cached rt from the hash table that contains it
 * Returns 0 on success, -EINVAL when @rt is not an RTF_CACHE route with
 * an origin (dst.from), -ENOENT when no matching entry exists.
 */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1452
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp (last-use time for LRU eviction/aging).
 * No-op when @rt is not an RTF_CACHE route with an origin.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
1488
1489 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1490 {
1491         struct rt6_exception_bucket *bucket;
1492         struct rt6_exception *rt6_ex;
1493         int i;
1494
1495         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1496                                         lockdep_is_held(&rt6_exception_lock));
1497
1498         if (bucket) {
1499                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1500                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1501                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1502                         }
1503                         bucket++;
1504                 }
1505         }
1506 }
1507
/* Lower rt6i_pmtu on all exception entries of @rt down to @mtu (only
 * entries with a larger, non-zero pmtu are touched).
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from has already
				 * been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}
1533
1534 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1535
/* Drop every RTF_CACHE|RTF_GATEWAY exception entry of @rt whose gateway
 * equals @gateway (used when that gateway stops being a valid next hop).
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check before taking the lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1570
/* Garbage-collect one exception entry: remove it when it is unused
 * (refcnt == 1) and idle beyond gc_args->timeout, or when it points at
 * a gateway whose neighbour is no longer flagged NTF_ROUTER.  Entries
 * that survive are counted in gc_args->more.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
	gc_args->more++;
}
1601
/* Run rt6_age_examine_exception() over every exception entry of @rt,
 * pruning stale ones and accumulating survivors in @gc_args.
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check before taking the lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}
1630
1631 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1632                                int oif, struct flowi6 *fl6, int flags)
1633 {
1634         struct fib6_node *fn, *saved_fn;
1635         struct rt6_info *rt, *rt_cache;
1636         int strict = 0;
1637
1638         strict |= flags & RT6_LOOKUP_F_IFACE;
1639         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1640         if (net->ipv6.devconf_all->forwarding == 0)
1641                 strict |= RT6_LOOKUP_F_REACHABLE;
1642
1643         rcu_read_lock();
1644
1645         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1646         saved_fn = fn;
1647
1648         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1649                 oif = 0;
1650
1651 redo_rt6_select:
1652         rt = rt6_select(net, fn, oif, strict);
1653         if (rt->rt6i_nsiblings)
1654                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1655         if (rt == net->ipv6.ip6_null_entry) {
1656                 fn = fib6_backtrack(fn, &fl6->saddr);
1657                 if (fn)
1658                         goto redo_rt6_select;
1659                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1660                         /* also consider unreachable route */
1661                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1662                         fn = saved_fn;
1663                         goto redo_rt6_select;
1664                 }
1665         }
1666
1667         /*Search through exception table */
1668         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1669         if (rt_cache)
1670                 rt = rt_cache;
1671
1672         if (rt == net->ipv6.ip6_null_entry) {
1673                 rcu_read_unlock();
1674                 dst_hold(&rt->dst);
1675                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1676                 return rt;
1677         } else if (rt->rt6i_flags & RTF_CACHE) {
1678                 if (ip6_hold_safe(net, &rt, true)) {
1679                         dst_use_noref(&rt->dst, jiffies);
1680                         rt6_dst_from_metrics_check(rt);
1681                 }
1682                 rcu_read_unlock();
1683                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1684                 return rt;
1685         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1686                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1687                 /* Create a RTF_CACHE clone which will not be
1688                  * owned by the fib6 tree.  It is for the special case where
1689                  * the daddr in the skb during the neighbor look-up is different
1690                  * from the fl6->daddr used to look-up route here.
1691                  */
1692
1693                 struct rt6_info *uncached_rt;
1694
1695                 if (ip6_hold_safe(net, &rt, true)) {
1696                         dst_use_noref(&rt->dst, jiffies);
1697                 } else {
1698                         rcu_read_unlock();
1699                         uncached_rt = rt;
1700                         goto uncached_rt_out;
1701                 }
1702                 rcu_read_unlock();
1703
1704                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1705                 dst_release(&rt->dst);
1706
1707                 if (uncached_rt) {
1708                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1709                          * No need for another dst_hold()
1710                          */
1711                         rt6_uncached_list_add(uncached_rt);
1712                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1713                 } else {
1714                         uncached_rt = net->ipv6.ip6_null_entry;
1715                         dst_hold(&uncached_rt->dst);
1716                 }
1717
1718 uncached_rt_out:
1719                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1720                 return uncached_rt;
1721
1722         } else {
1723                 /* Get a percpu copy */
1724
1725                 struct rt6_info *pcpu_rt;
1726
1727                 dst_use_noref(&rt->dst, jiffies);
1728                 local_bh_disable();
1729                 pcpu_rt = rt6_get_pcpu_route(rt);
1730
1731                 if (!pcpu_rt) {
1732                         /* atomic_inc_not_zero() is needed when using rcu */
1733                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1734                                 /* No dst_hold() on rt is needed because grabbing
1735                                  * rt->rt6i_ref makes sure rt can't be released.
1736                                  */
1737                                 pcpu_rt = rt6_make_pcpu_route(rt);
1738                                 rt6_release(rt);
1739                         } else {
1740                                 /* rt is already removed from tree */
1741                                 pcpu_rt = net->ipv6.ip6_null_entry;
1742                                 dst_hold(&pcpu_rt->dst);
1743                         }
1744                 }
1745                 local_bh_enable();
1746                 rcu_read_unlock();
1747                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1748                 return pcpu_rt;
1749         }
1750 }
1751 EXPORT_SYMBOL_GPL(ip6_pol_route);
1752
/* fib6_rule_lookup() callback for input routes: resolve via
 * ip6_pol_route() using the incoming interface (flowi6_iif) as oif.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
                                            struct flowi6 *fl6, int flags)
{
        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1758
1759 struct dst_entry *ip6_route_input_lookup(struct net *net,
1760                                          struct net_device *dev,
1761                                          struct flowi6 *fl6, int flags)
1762 {
1763         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1764                 flags |= RT6_LOOKUP_F_IFACE;
1765
1766         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1767 }
1768 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1769
/* Fill @keys with the L3 fields used for multipath hashing of @skb.
 *
 * For ICMPv6 error messages (dest unreachable, packet too big, time
 * exceeded, param problem) the fields are taken from the embedded
 * offending packet's IPv6 header instead of the outer one, so errors
 * hash onto the same path as the flow that triggered them.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
                                  struct flow_keys *keys)
{
        const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
        const struct ipv6hdr *key_iph = outer_iph;
        const struct ipv6hdr *inner_iph;
        const struct icmp6hdr *icmph;
        struct ipv6hdr _inner_iph;

        if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
                goto out;

        icmph = icmp6_hdr(skb);
        /* Only these error types carry the offending packet; other
         * ICMPv6 messages keep hashing on the outer header.
         */
        if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
            icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
            icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
            icmph->icmp6_type != ICMPV6_PARAMPROB)
                goto out;

        /* Copy out the embedded IPv6 header; fails on a truncated skb,
         * in which case we fall back to the outer header.
         */
        inner_iph = skb_header_pointer(skb,
                                       skb_transport_offset(skb) + sizeof(*icmph),
                                       sizeof(_inner_iph), &_inner_iph);
        if (!inner_iph)
                goto out;

        key_iph = inner_iph;
out:
        memset(keys, 0, sizeof(*keys));
        keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        keys->addrs.v6addrs.src = key_iph->saddr;
        keys->addrs.v6addrs.dst = key_iph->daddr;
        keys->tags.flow_label = ip6_flowinfo(key_iph);
        keys->basic.ip_proto = key_iph->nexthdr;
}
1804
1805 /* if skb is set it will be used and fl6 can be NULL */
1806 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1807 {
1808         struct flow_keys hash_keys;
1809
1810         if (skb) {
1811                 ip6_multipath_l3_keys(skb, &hash_keys);
1812                 return flow_hash_from_keys(&hash_keys);
1813         }
1814
1815         return get_hash_from_flowi6(fl6);
1816 }
1817
/* Route an incoming skb: build a flowi6 from its IPv6 header (plus any
 * received tunnel metadata) and attach the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int flags = RT6_LOOKUP_F_HAS_SADDR;
        struct ip_tunnel_info *tun_info;
        struct flowi6 fl6 = {
                .flowi6_iif = skb->dev->ifindex,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_mark = skb->mark,
                .flowi6_proto = iph->nexthdr,
        };

        /* Carry the RX tunnel id into the lookup key, if present */
        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
        /* ICMPv6 may hash on the embedded packet, see ip6_multipath_l3_keys() */
        if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
                fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
        skb_dst_drop(skb);
        skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
1841
/* fib6_rule_lookup() callback for output routes: resolve via
 * ip6_pol_route() using the outgoing interface (flowi6_oif) as oif.
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
{
        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
1847
1848 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1849                                          struct flowi6 *fl6, int flags)
1850 {
1851         bool any_src;
1852
1853         if (rt6_need_strict(&fl6->daddr)) {
1854                 struct dst_entry *dst;
1855
1856                 dst = l3mdev_link_scope_lookup(net, fl6);
1857                 if (dst)
1858                         return dst;
1859         }
1860
1861         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1862
1863         any_src = ipv6_addr_any(&fl6->saddr);
1864         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1865             (fl6->flowi6_oif && any_src))
1866                 flags |= RT6_LOOKUP_F_IFACE;
1867
1868         if (!any_src)
1869                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1870         else if (sk)
1871                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1872
1873         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1874 }
1875 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1876
/* Replace @dst_orig with a blackhole dst that discards all traffic.
 *
 * The new entry copies the metrics, gateway, flags (minus the internal
 * RTF_PCPU flag) and destination/source keys from the original so it
 * still describes the same flow, but its input/output hooks drop every
 * packet.  The reference to @dst_orig is released; returns
 * ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
        struct net_device *loopback_dev = net->loopback_dev;
        struct dst_entry *new = NULL;

        rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
                       DST_OBSOLETE_DEAD, 0);
        if (rt) {
                rt6_info_init(rt);
                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

                new = &rt->dst;
                new->__use = 1;
                /* Drop traffic in both directions */
                new->input = dst_discard;
                new->output = dst_discard_out;

                dst_copy_metrics(new, &ort->dst);

                rt->rt6i_idev = in6_dev_get(loopback_dev);
                rt->rt6i_gateway = ort->rt6i_gateway;
                /* do not inherit the internal per-cpu-clone marker */
                rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
                rt->rt6i_metric = 0;

                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
        }

        dst_release(dst_orig);
        return new ? new : ERR_PTR(-ENOMEM);
}
1910
1911 /*
1912  *      Destination cache support functions
1913  */
1914
1915 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1916 {
1917         if (rt->dst.from &&
1918             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1919                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1920 }
1921
1922 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1923 {
1924         u32 rt_cookie = 0;
1925
1926         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1927                 return NULL;
1928
1929         if (rt6_check_expired(rt))
1930                 return NULL;
1931
1932         return &rt->dst;
1933 }
1934
1935 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1936 {
1937         if (!__rt6_check_expired(rt) &&
1938             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1939             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1940                 return &rt->dst;
1941         else
1942                 return NULL;
1943 }
1944
/* dst_ops->check: revalidate a cached IPv6 dst.  Returns @dst when it
 * is still usable, NULL when the caller must perform a fresh lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rt6_info *rt;

        rt = (struct rt6_info *) dst;

        /* All IPV6 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         */

        rt6_dst_from_metrics_check(rt);

        /* Per-cpu clones, and uncached clones that have a parent
         * (dst.from), are validated through the parent route; everything
         * else checks its own cookie and expiry.
         */
        if (rt->rt6i_flags & RTF_PCPU ||
            (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
                return rt6_dst_from_check(rt, cookie);
        else
                return rt6_check(rt, cookie);
}
1964
1965 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1966 {
1967         struct rt6_info *rt = (struct rt6_info *) dst;
1968
1969         if (rt) {
1970                 if (rt->rt6i_flags & RTF_CACHE) {
1971                         if (rt6_check_expired(rt)) {
1972                                 ip6_del_rt(rt);
1973                                 dst = NULL;
1974                         }
1975                 } else {
1976                         dst_release(dst);
1977                         dst = NULL;
1978                 }
1979         }
1980         return dst;
1981 }
1982
/* dst_ops->link_failure: transmission on the route's link failed.
 * Report address-unreachable back to the sender, then invalidate the
 * route: cached clones are deleted outright, while default routes have
 * their fib6 node's serial number set to -1 (so cached dsts referring
 * to it stop validating and get relooked up).
 */
static void ip6_link_failure(struct sk_buff *skb)
{
        struct rt6_info *rt;

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

        rt = (struct rt6_info *) skb_dst(skb);
        if (rt) {
                if (rt->rt6i_flags & RTF_CACHE) {
                        /* take a ref before deleting; skip if the route
                         * is already on its way to being freed
                         */
                        if (dst_hold_safe(&rt->dst))
                                ip6_del_rt(rt);
                } else {
                        struct fib6_node *fn;

                        rcu_read_lock();
                        fn = rcu_dereference(rt->rt6i_node);
                        if (fn && (rt->rt6i_flags & RTF_DEFAULT))
                                fn->fn_sernum = -1;
                        rcu_read_unlock();
                }
        }
}
2005
/* Record a new path MTU on @rt, mark it as modified by PMTU discovery,
 * and (re)arm its expiry using the ip6_rt_mtu_expires sysctl interval.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
        struct net *net = dev_net(rt->dst.dev);

        rt->rt6i_flags |= RTF_MODIFIED;
        rt->rt6i_pmtu = mtu;
        rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2014
2015 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2016 {
2017         return !(rt->rt6i_flags & RTF_CACHE) &&
2018                 (rt->rt6i_flags & RTF_PCPU ||
2019                  rcu_access_pointer(rt->rt6i_node));
2020 }
2021
/* Core PMTU update.  @daddr/@saddr come from the packet header (@iph)
 * or the socket (@sk); with neither, only an in-place update is
 * possible.  The route is modified directly when that is safe
 * (!rt6_cache_allowed_for_pmtu()); otherwise a RTF_CACHE exception
 * clone carrying the new MTU is inserted.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
                                 const struct ipv6hdr *iph, u32 mtu)
{
        const struct in6_addr *daddr, *saddr;
        struct rt6_info *rt6 = (struct rt6_info *)dst;

        /* local routes don't carry PMTU state */
        if (rt6->rt6i_flags & RTF_LOCAL)
                return;

        /* administratively locked MTU must not be changed */
        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (iph) {
                daddr = &iph->daddr;
                saddr = &iph->saddr;
        } else if (sk) {
                daddr = &sk->sk_v6_daddr;
                saddr = &inet6_sk(sk)->saddr;
        } else {
                daddr = NULL;
                saddr = NULL;
        }
        dst_confirm_neigh(dst, daddr);
        /* clamp to the IPv6 minimum MTU; ignore non-shrinking updates */
        mtu = max_t(u32, mtu, IPV6_MIN_MTU);
        if (mtu >= dst_mtu(dst))
                return;

        if (!rt6_cache_allowed_for_pmtu(rt6)) {
                rt6_do_update_pmtu(rt6, mtu);
                /* update rt6_ex->stamp for cache */
                if (rt6->rt6i_flags & RTF_CACHE)
                        rt6_update_exception_stamp_rt(rt6);
        } else if (daddr) {
                struct rt6_info *nrt6;

                nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
                if (nrt6) {
                        rt6_do_update_pmtu(nrt6, mtu);
                        /* if the exception cannot be inserted, drop the
                         * clone immediately
                         */
                        if (rt6_insert_exception(nrt6, rt6))
                                dst_release_immediate(&nrt6->dst);
                }
        }
}
2065
/* dst_ops->update_pmtu: forward to __ip6_rt_update_pmtu(), passing the
 * IPv6 header when an skb is available.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu)
{
        __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
2071
2072 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2073                      int oif, u32 mark, kuid_t uid)
2074 {
2075         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2076         struct dst_entry *dst;
2077         struct flowi6 fl6;
2078
2079         memset(&fl6, 0, sizeof(fl6));
2080         fl6.flowi6_oif = oif;
2081         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2082         fl6.daddr = iph->daddr;
2083         fl6.saddr = iph->saddr;
2084         fl6.flowlabel = ip6_flowinfo(iph);
2085         fl6.flowi6_uid = uid;
2086
2087         dst = ip6_route_output(net, NULL, &fl6);
2088         if (!dst->error)
2089                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2090         dst_release(dst);
2091 }
2092 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2093
/* Socket-level PMTU update: apply the new MTU for @skb's flow, then, if
 * the socket's cached dst no longer validates, refresh it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
        struct dst_entry *dst;

        ip6_update_pmtu(skb, sock_net(sk), mtu,
                        sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

        dst = __sk_dst_get(sk);
        if (!dst || !dst->obsolete ||
            dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
                return;

        bh_lock_sock(sk);
        /* only refresh when we own the socket and the peer is a native
         * IPv6 address (not v4-mapped)
         */
        if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
                ip6_datagram_dst_update(sk, false);
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2112
/* Flow key for redirect processing: a flowi6 extended with the address
 * of the router that sent the redirect.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;      /* must be first: __ip6_route_redirect() casts back */
        struct in6_addr gateway;
};
2118
/* fib6_rule_lookup() callback for redirect processing: find the route
 * the redirect applies to, accepting it only if it goes through the
 * router that sent the redirect (rdfl->gateway).  Returns a held route;
 * ip6_null_entry when nothing acceptable is found.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             int flags)
{
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
        struct rt6_info *rt, *rt_cache;
        struct fib6_node *fn;

        /* Get the "current" route for this destination and
         * check if the redirect has come from appropriate router.
         *
         * RFC 4861 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */

        rcu_read_lock();
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        for_each_fib6_node_rt_rcu(fn) {
                if (rt6_check_expired(rt))
                        continue;
                if (rt->dst.error)
                        break;
                if (!(rt->rt6i_flags & RTF_GATEWAY))
                        continue;
                if (fl6->flowi6_oif != rt->dst.dev->ifindex)
                        continue;
                /* rt_cache's gateway might be different from its 'parent'
                 * in the case of an ip redirect.
                 * So we keep searching in the exception table if the gateway
                 * is different.
                 */
                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
                        rt_cache = rt6_find_cached_rt(rt,
                                                      &fl6->daddr,
                                                      &fl6->saddr);
                        if (rt_cache &&
                            ipv6_addr_equal(&rdfl->gateway,
                                            &rt_cache->rt6i_gateway)) {
                                rt = rt_cache;
                                break;
                        }
                        continue;
                }
                break;
        }

        if (!rt)
                rt = net->ipv6.ip6_null_entry;
        else if (rt->dst.error) {
                rt = net->ipv6.ip6_null_entry;
                goto out;
        }

        /* nothing matched in this node: back up the tree and retry */
        if (rt == net->ipv6.ip6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }

out:
        ip6_hold_safe(net, &rt, true);

        rcu_read_unlock();

        trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
        return rt;
};
2191
2192 static struct dst_entry *ip6_route_redirect(struct net *net,
2193                                         const struct flowi6 *fl6,
2194                                         const struct in6_addr *gateway)
2195 {
2196         int flags = RT6_LOOKUP_F_HAS_SADDR;
2197         struct ip6rd_flowi rdfl;
2198
2199         rdfl.fl6 = *fl6;
2200         rdfl.gateway = *gateway;
2201
2202         return fib6_rule_lookup(net, &rdfl.fl6,
2203                                 flags, __ip6_route_redirect);
2204 }
2205
2206 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2207                   kuid_t uid)
2208 {
2209         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2210         struct dst_entry *dst;
2211         struct flowi6 fl6;
2212
2213         memset(&fl6, 0, sizeof(fl6));
2214         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2215         fl6.flowi6_oif = oif;
2216         fl6.flowi6_mark = mark;
2217         fl6.daddr = iph->daddr;
2218         fl6.saddr = iph->saddr;
2219         fl6.flowlabel = ip6_flowinfo(iph);
2220         fl6.flowi6_uid = uid;
2221
2222         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2223         rt6_do_redirect(dst, NULL, skb);
2224         dst_release(dst);
2225 }
2226 EXPORT_SYMBOL_GPL(ip6_redirect);
2227
/* Process an ICMPv6 redirect when skb->data does not point at the
 * offending packet's header: the flow is rebuilt from the redirect
 * message itself, with msg->dest as the destination.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
                            u32 mark)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
        struct dst_entry *dst;
        struct flowi6 fl6;

        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_iif = LOOPBACK_IFINDEX;
        fl6.flowi6_oif = oif;
        fl6.flowi6_mark = mark;
        fl6.daddr = msg->dest;
        /* NOTE(review): saddr is the redirect's destination, i.e. our
         * own address — assumed intentional for this no-header variant;
         * confirm against callers.
         */
        fl6.saddr = iph->daddr;
        fl6.flowi6_uid = sock_net_uid(net, NULL);

        dst = ip6_route_redirect(net, &fl6, &iph->saddr);
        rt6_do_redirect(dst, NULL, skb);
        dst_release(dst);
}
2248
/* Socket wrapper for ip6_redirect(): use the socket's bound device,
 * mark and uid for the lookup.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
                     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2255
2256 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2257 {
2258         struct net_device *dev = dst->dev;
2259         unsigned int mtu = dst_mtu(dst);
2260         struct net *net = dev_net(dev);
2261
2262         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2263
2264         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2265                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2266
2267         /*
2268          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2269          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2270          * IPV6_MAXPLEN is also valid and means: "any MSS,
2271          * rely only on pmtu discovery"
2272          */
2273         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2274                 mtu = IPV6_MAXPLEN;
2275         return mtu;
2276 }
2277
2278 static unsigned int ip6_mtu(const struct dst_entry *dst)
2279 {
2280         const struct rt6_info *rt = (const struct rt6_info *)dst;
2281         unsigned int mtu = rt->rt6i_pmtu;
2282         struct inet6_dev *idev;
2283
2284         if (mtu)
2285                 goto out;
2286
2287         mtu = dst_metric_raw(dst, RTAX_MTU);
2288         if (mtu)
2289                 goto out;
2290
2291         mtu = IPV6_MIN_MTU;
2292
2293         rcu_read_lock();
2294         idev = __in6_dev_get(dst->dev);
2295         if (idev)
2296                 mtu = idev->cnf.mtu6;
2297         rcu_read_unlock();
2298
2299 out:
2300         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2301
2302         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2303 }
2304
/* Allocate a one-off dst for sending an ICMPv6 packet to fl6->daddr via
 * @dev.  The route is not inserted in the FIB; it goes on the uncached
 * list so device teardown can release it.  Returns an ERR_PTR on
 * failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                                  struct flowi6 *fl6)
{
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct inet6_dev *idev = in6_dev_get(dev);
        struct net *net = dev_net(dev);

        if (unlikely(!idev))
                return ERR_PTR(-ENODEV);

        rt = ip6_dst_alloc(net, dev, 0);
        if (unlikely(!rt)) {
                /* drop the idev reference taken above */
                in6_dev_put(idev);
                dst = ERR_PTR(-ENOMEM);
                goto out;
        }

        rt->dst.flags |= DST_HOST;
        rt->dst.output  = ip6_output;
        rt->rt6i_gateway  = fl6->daddr;
        rt->rt6i_dst.addr = fl6->daddr;
        rt->rt6i_dst.plen = 128;
        rt->rt6i_idev     = idev;
        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

        /* Add this dst into uncached_list so that rt6_ifdown() can
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);
        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
        return dst;
}
2342
/* dst_ops->gc: garbage-collect IPv6 dst entries.
 *
 * fib6_run_gc() is invoked only when the entry count exceeds the
 * ip6_rt_max_size sysctl or the minimum interval since the last GC has
 * elapsed.  The adaptive ip6_rt_gc_expire value grows by one on each
 * pass, is reset to half the gc timeout once the count drops below
 * gc_thresh, and decays by the elasticity shift on every call.
 * Returns nonzero while the table is still over rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
        int entries;

        entries = dst_entries_get_fast(ops);
        if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
            entries <= rt_max_size)
                goto out;

        net->ipv6.ip6_rt_gc_expire++;
        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
        entries = dst_entries_get_slow(ops);
        if (entries < ops->gc_thresh)
                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
        return entries > rt_max_size;
}
2367
2368 static int ip6_convert_metrics(struct mx6_config *mxc,
2369                                const struct fib6_config *cfg)
2370 {
2371         bool ecn_ca = false;
2372         struct nlattr *nla;
2373         int remaining;
2374         u32 *mp;
2375
2376         if (!cfg->fc_mx)
2377                 return 0;
2378
2379         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2380         if (unlikely(!mp))
2381                 return -ENOMEM;
2382
2383         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2384                 int type = nla_type(nla);
2385                 u32 val;
2386
2387                 if (!type)
2388                         continue;
2389                 if (unlikely(type > RTAX_MAX))
2390                         goto err;
2391
2392                 if (type == RTAX_CC_ALGO) {
2393                         char tmp[TCP_CA_NAME_MAX];
2394
2395                         nla_strlcpy(tmp, nla, sizeof(tmp));
2396                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2397                         if (val == TCP_CA_UNSPEC)
2398                                 goto err;
2399                 } else {
2400                         val = nla_get_u32(nla);
2401                 }
2402                 if (type == RTAX_HOPLIMIT && val > 255)
2403                         val = 255;
2404                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2405                         goto err;
2406
2407                 mp[type - 1] = val;
2408                 __set_bit(type - 1, mxc->mx_valid);
2409         }
2410
2411         if (ecn_ca) {
2412                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2413                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2414         }
2415
2416         mxc->mx = mp;
2417         return 0;
2418  err:
2419         kfree(mp);
2420         return -EINVAL;
2421 }
2422
/* Look up the nexthop gateway @gw_addr in the single table identified
 * by cfg->fc_table (no policy-rule traversal).  Returns a held route on
 * success, or NULL when the table does not exist or only the null entry
 * matched (the caller may then fall back to a full lookup).
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
                                            struct fib6_config *cfg,
                                            const struct in6_addr *gw_addr)
{
        struct flowi6 fl6 = {
                .flowi6_oif = cfg->fc_ifindex,
                .daddr = *gw_addr,
                .saddr = cfg->fc_prefsrc,
        };
        struct fib6_table *table;
        struct rt6_info *rt;
        int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;

        table = fib6_get_table(net, cfg->fc_table);
        if (!table)
                return NULL;

        if (!ipv6_addr_any(&cfg->fc_prefsrc))
                flags |= RT6_LOOKUP_F_HAS_SADDR;

        rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

        /* if table lookup failed, fall back to full lookup */
        if (rt == net->ipv6.ip6_null_entry) {
                ip6_rt_put(rt);
                rt = NULL;
        }

        return rt;
}
2453
2454 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2455                                               struct netlink_ext_ack *extack)
2456 {
2457         struct net *net = cfg->fc_nlinfo.nl_net;
2458         struct rt6_info *rt = NULL;
2459         struct net_device *dev = NULL;
2460         struct inet6_dev *idev = NULL;
2461         struct fib6_table *table;
2462         int addr_type;
2463         int err = -EINVAL;
2464
2465         /* RTF_PCPU is an internal flag; can not be set by userspace */
2466         if (cfg->fc_flags & RTF_PCPU) {
2467                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2468                 goto out;
2469         }
2470
2471         if (cfg->fc_dst_len > 128) {
2472                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2473                 goto out;
2474         }
2475         if (cfg->fc_src_len > 128) {
2476                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2477                 goto out;
2478         }
2479 #ifndef CONFIG_IPV6_SUBTREES
2480         if (cfg->fc_src_len) {
2481                 NL_SET_ERR_MSG(extack,
2482                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2483                 goto out;
2484         }
2485 #endif
2486         if (cfg->fc_ifindex) {
2487                 err = -ENODEV;
2488                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2489                 if (!dev)
2490                         goto out;
2491                 idev = in6_dev_get(dev);
2492                 if (!idev)
2493                         goto out;
2494         }
2495
2496         if (cfg->fc_metric == 0)
2497                 cfg->fc_metric = IP6_RT_PRIO_USER;
2498
2499         err = -ENOBUFS;
2500         if (cfg->fc_nlinfo.nlh &&
2501             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2502                 table = fib6_get_table(net, cfg->fc_table);
2503                 if (!table) {
2504                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2505                         table = fib6_new_table(net, cfg->fc_table);
2506                 }
2507         } else {
2508                 table = fib6_new_table(net, cfg->fc_table);
2509         }
2510
2511         if (!table)
2512                 goto out;
2513
2514         rt = ip6_dst_alloc(net, NULL,
2515                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2516
2517         if (!rt) {
2518                 err = -ENOMEM;
2519                 goto out;
2520         }
2521
2522         if (cfg->fc_flags & RTF_EXPIRES)
2523                 rt6_set_expires(rt, jiffies +
2524                                 clock_t_to_jiffies(cfg->fc_expires));
2525         else
2526                 rt6_clean_expires(rt);
2527
2528         if (cfg->fc_protocol == RTPROT_UNSPEC)
2529                 cfg->fc_protocol = RTPROT_BOOT;
2530         rt->rt6i_protocol = cfg->fc_protocol;
2531
2532         addr_type = ipv6_addr_type(&cfg->fc_dst);
2533
2534         if (addr_type & IPV6_ADDR_MULTICAST)
2535                 rt->dst.input = ip6_mc_input;
2536         else if (cfg->fc_flags & RTF_LOCAL)
2537                 rt->dst.input = ip6_input;
2538         else
2539                 rt->dst.input = ip6_forward;
2540
2541         rt->dst.output = ip6_output;
2542
2543         if (cfg->fc_encap) {
2544                 struct lwtunnel_state *lwtstate;
2545
2546                 err = lwtunnel_build_state(cfg->fc_encap_type,
2547                                            cfg->fc_encap, AF_INET6, cfg,
2548                                            &lwtstate, extack);
2549                 if (err)
2550                         goto out;
2551                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2552                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2553                         rt->dst.lwtstate->orig_output = rt->dst.output;
2554                         rt->dst.output = lwtunnel_output;
2555                 }
2556                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2557                         rt->dst.lwtstate->orig_input = rt->dst.input;
2558                         rt->dst.input = lwtunnel_input;
2559                 }
2560         }
2561
2562         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2563         rt->rt6i_dst.plen = cfg->fc_dst_len;
2564         if (rt->rt6i_dst.plen == 128)
2565                 rt->dst.flags |= DST_HOST;
2566
2567 #ifdef CONFIG_IPV6_SUBTREES
2568         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2569         rt->rt6i_src.plen = cfg->fc_src_len;
2570 #endif
2571
2572         rt->rt6i_metric = cfg->fc_metric;
2573
2574         /* We cannot add true routes via loopback here,
2575            they would result in kernel looping; promote them to reject routes
2576          */
2577         if ((cfg->fc_flags & RTF_REJECT) ||
2578             (dev && (dev->flags & IFF_LOOPBACK) &&
2579              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2580              !(cfg->fc_flags & RTF_LOCAL))) {
2581                 /* hold loopback dev/idev if we haven't done so. */
2582                 if (dev != net->loopback_dev) {
2583                         if (dev) {
2584                                 dev_put(dev);
2585                                 in6_dev_put(idev);
2586                         }
2587                         dev = net->loopback_dev;
2588                         dev_hold(dev);
2589                         idev = in6_dev_get(dev);
2590                         if (!idev) {
2591                                 err = -ENODEV;
2592                                 goto out;
2593                         }
2594                 }
2595                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2596                 switch (cfg->fc_type) {
2597                 case RTN_BLACKHOLE:
2598                         rt->dst.error = -EINVAL;
2599                         rt->dst.output = dst_discard_out;
2600                         rt->dst.input = dst_discard;
2601                         break;
2602                 case RTN_PROHIBIT:
2603                         rt->dst.error = -EACCES;
2604                         rt->dst.output = ip6_pkt_prohibit_out;
2605                         rt->dst.input = ip6_pkt_prohibit;
2606                         break;
2607                 case RTN_THROW:
2608                 case RTN_UNREACHABLE:
2609                 default:
2610                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2611                                         : (cfg->fc_type == RTN_UNREACHABLE)
2612                                         ? -EHOSTUNREACH : -ENETUNREACH;
2613                         rt->dst.output = ip6_pkt_discard_out;
2614                         rt->dst.input = ip6_pkt_discard;
2615                         break;
2616                 }
2617                 goto install_route;
2618         }
2619
2620         if (cfg->fc_flags & RTF_GATEWAY) {
2621                 const struct in6_addr *gw_addr;
2622                 int gwa_type;
2623
2624                 gw_addr = &cfg->fc_gateway;
2625                 gwa_type = ipv6_addr_type(gw_addr);
2626
2627                 /* if gw_addr is local we will fail to detect this in case
2628                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2629                  * will return already-added prefix route via interface that
2630                  * prefix route was assigned to, which might be non-loopback.
2631                  */
2632                 err = -EINVAL;
2633                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2634                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2635                                             dev : NULL, 0, 0)) {
2636                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2637                         goto out;
2638                 }
2639                 rt->rt6i_gateway = *gw_addr;
2640
2641                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2642                         struct rt6_info *grt = NULL;
2643
2644                         /* IPv6 strictly inhibits using not link-local
2645                            addresses as nexthop address.
2646                            Otherwise, router will not able to send redirects.
2647                            It is very good, but in some (rare!) circumstances
2648                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2649                            some exceptions. --ANK
2650                            We allow IPv4-mapped nexthops to support RFC4798-type
2651                            addressing
2652                          */
2653                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2654                                           IPV6_ADDR_MAPPED))) {
2655                                 NL_SET_ERR_MSG(extack,
2656                                                "Invalid gateway address");
2657                                 goto out;
2658                         }
2659
2660                         if (cfg->fc_table) {
2661                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2662
2663                                 if (grt) {
2664                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2665                                             (dev && dev != grt->dst.dev)) {
2666                                                 ip6_rt_put(grt);
2667                                                 grt = NULL;
2668                                         }
2669                                 }
2670                         }
2671
2672                         if (!grt)
2673                                 grt = rt6_lookup(net, gw_addr, NULL,
2674                                                  cfg->fc_ifindex, 1);
2675
2676                         err = -EHOSTUNREACH;
2677                         if (!grt)
2678                                 goto out;
2679                         if (dev) {
2680                                 if (dev != grt->dst.dev) {
2681                                         ip6_rt_put(grt);
2682                                         goto out;
2683                                 }
2684                         } else {
2685                                 dev = grt->dst.dev;
2686                                 idev = grt->rt6i_idev;
2687                                 dev_hold(dev);
2688                                 in6_dev_hold(grt->rt6i_idev);
2689                         }
2690                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2691                                 err = 0;
2692                         ip6_rt_put(grt);
2693
2694                         if (err)
2695                                 goto out;
2696                 }
2697                 err = -EINVAL;
2698                 if (!dev) {
2699                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2700                         goto out;
2701                 } else if (dev->flags & IFF_LOOPBACK) {
2702                         NL_SET_ERR_MSG(extack,
2703                                        "Egress device can not be loopback device for this route");
2704                         goto out;
2705                 }
2706         }
2707
2708         err = -ENODEV;
2709         if (!dev)
2710                 goto out;
2711
2712         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2713                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2714                         NL_SET_ERR_MSG(extack, "Invalid source address");
2715                         err = -EINVAL;
2716                         goto out;
2717                 }
2718                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2719                 rt->rt6i_prefsrc.plen = 128;
2720         } else
2721                 rt->rt6i_prefsrc.plen = 0;
2722
2723         rt->rt6i_flags = cfg->fc_flags;
2724
2725 install_route:
2726         rt->dst.dev = dev;
2727         rt->rt6i_idev = idev;
2728         rt->rt6i_table = table;
2729
2730         cfg->fc_nlinfo.nl_net = dev_net(dev);
2731
2732         return rt;
2733 out:
2734         if (dev)
2735                 dev_put(dev);
2736         if (idev)
2737                 in6_dev_put(idev);
2738         if (rt)
2739                 dst_release_immediate(&rt->dst);
2740
2741         return ERR_PTR(err);
2742 }
2743
2744 int ip6_route_add(struct fib6_config *cfg,
2745                   struct netlink_ext_ack *extack)
2746 {
2747         struct mx6_config mxc = { .mx = NULL, };
2748         struct rt6_info *rt;
2749         int err;
2750
2751         rt = ip6_route_info_create(cfg, extack);
2752         if (IS_ERR(rt)) {
2753                 err = PTR_ERR(rt);
2754                 rt = NULL;
2755                 goto out;
2756         }
2757
2758         err = ip6_convert_metrics(&mxc, cfg);
2759         if (err)
2760                 goto out;
2761
2762         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2763
2764         kfree(mxc.mx);
2765
2766         return err;
2767 out:
2768         if (rt)
2769                 dst_release_immediate(&rt->dst);
2770
2771         return err;
2772 }
2773
2774 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2775 {
2776         int err;
2777         struct fib6_table *table;
2778         struct net *net = dev_net(rt->dst.dev);
2779
2780         if (rt == net->ipv6.ip6_null_entry) {
2781                 err = -ENOENT;
2782                 goto out;
2783         }
2784
2785         table = rt->rt6i_table;
2786         spin_lock_bh(&table->tb6_lock);
2787         err = fib6_del(rt, info);
2788         spin_unlock_bh(&table->tb6_lock);
2789
2790 out:
2791         ip6_rt_put(rt);
2792         return err;
2793 }
2794
/* Delete @rt, deriving the netlink notification context from the
 * route's own network namespace.  Consumes the caller's reference.
 */
int ip6_del_rt(struct rt6_info *rt)
{
	struct nl_info info = {
		.nl_net = dev_net(rt->dst.dev),
	};
	return __ip6_del_rt(rt, &info);
}
2802
/* Delete @rt and, when cfg->fc_delete_all_nh is set, all of its ECMP
 * siblings in one pass under the table lock.  Consumes the caller's
 * reference on @rt.
 *
 * When multiple hops are removed, a single RTM_DELROUTE notification
 * describing the whole route is sent instead of one message per hop.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	/* The null entry is never removable. */
	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* if the message cannot be built, fall back to the
			 * per-hop notifications emitted by fib6_del()
			 */
			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe variant: fib6_del() unlinks each sibling */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* send the combined notification, if one was built above */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2854
/* Delete the route matching @cfg.  With RTF_CACHE set, the match is
 * searched among cached clones (exception routes) of each candidate
 * instead of the FIB entries themselves.  Returns -ESRCH when nothing
 * matches.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		/* the macro walks fn's routes through the local 'rt' */
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				/* match against the cached clone instead */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* take a reference before leaving the RCU section;
			 * a route already being freed cannot be deleted here
			 */
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
2911
/* Process a received ICMPv6 Redirect (RFC 4861, section 8) for the
 * destination route @dst: validate the message, update the neighbour
 * cache for the new first hop, and install a cached clone of the route
 * pointing at it.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* length of the ND options trailing the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination is directly on-link;
	 * otherwise the target must be a link-local router address
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers and interfaces configured to ignore redirects drop it */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* create the neighbour entry for the new first hop if needed */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* cached clone of the original route, rerouted via the target */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3029
3030 /*
3031  *      Misc support functions
3032  */
3033
/* Record @from as the origin (parent) route of @rt and share the
 * parent's metrics.  @from must not itself be derived from another
 * route (its dst.from must be NULL).
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	/* hold @from for as long as rt->dst.from points at it */
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3043
/* Initialize @rt as a copy of @ort and link it back to @ort as its
 * parent via rt6_set_from().  Takes holds on the idev and lwtstate
 * that @rt now references.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	/* also holds a reference on ort's dst and shares its metrics */
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3065
3066 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a route-information route (RTF_ROUTEINFO, learned from an RA
 * option) for @prefix with gateway @gwaddr on @dev.  Returns the route
 * with a reference held, or NULL if not found.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* routes go into the device's VRF table when it is enslaved */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* the macro walks fn's routes through the local 'rt' */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* take a reference so the route outlives the RCU section */
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3101
3102 static struct rt6_info *rt6_add_route_info(struct net *net,
3103                                            const struct in6_addr *prefix, int prefixlen,
3104                                            const struct in6_addr *gwaddr,
3105                                            struct net_device *dev,
3106                                            unsigned int pref)
3107 {
3108         struct fib6_config cfg = {
3109                 .fc_metric      = IP6_RT_PRIO_USER,
3110                 .fc_ifindex     = dev->ifindex,
3111                 .fc_dst_len     = prefixlen,
3112                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3113                                   RTF_UP | RTF_PREF(pref),
3114                 .fc_protocol = RTPROT_RA,
3115                 .fc_nlinfo.portid = 0,
3116                 .fc_nlinfo.nlh = NULL,
3117                 .fc_nlinfo.nl_net = net,
3118         };
3119
3120         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3121         cfg.fc_dst = *prefix;
3122         cfg.fc_gateway = *gwaddr;
3123
3124         /* We should treat it as a default route if prefix length is 0. */
3125         if (!prefixlen)
3126                 cfg.fc_flags |= RTF_DEFAULT;
3127
3128         ip6_route_add(&cfg, NULL);
3129
3130         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3131 }
3132 #endif
3133
/* Find the RA-learned (RTF_ADDRCONF|RTF_DEFAULT) default route via
 * gateway @addr on @dev.  Returns the route with a reference held, or
 * NULL if no such route exists.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	/* default routes go into the device's VRF table when enslaved */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* the macro walks the root node's routes through 'rt';
	 * rt is NULL when the walk completes without a break
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3156
3157 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3158                                      struct net_device *dev,
3159                                      unsigned int pref)
3160 {
3161         struct fib6_config cfg = {
3162                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3163                 .fc_metric      = IP6_RT_PRIO_USER,
3164                 .fc_ifindex     = dev->ifindex,
3165                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3166                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3167                 .fc_protocol = RTPROT_RA,
3168                 .fc_nlinfo.portid = 0,
3169                 .fc_nlinfo.nlh = NULL,
3170                 .fc_nlinfo.nl_net = dev_net(dev),
3171         };
3172
3173         cfg.fc_gateway = *gwaddr;
3174
3175         if (!ip6_route_add(&cfg, NULL)) {
3176                 struct fib6_table *table;
3177
3178                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3179                 if (table)
3180                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3181         }
3182
3183         return rt6_get_dflt_router(gwaddr, dev);
3184 }
3185
/* Remove every RA-learned default route from @table.
 *
 * The walk restarts from the root after each deletion because the RCU
 * read section must be left before calling ip6_del_rt().
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	/* the macro walks the root node's routes through 'rt' */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		/* keep routes on interfaces with accept_ra == 2
		 * (NOTE(review): presumably "accept RAs even when
		 * forwarding" — confirm against ip-sysctl docs)
		 */
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* only delete if we could take a reference;
			 * either way restart outside the RCU section
			 */
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(rt);
			} else {
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3208
3209 void rt6_purge_dflt_routers(struct net *net)
3210 {
3211         struct fib6_table *table;
3212         struct hlist_head *head;
3213         unsigned int h;
3214
3215         rcu_read_lock();
3216
3217         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3218                 head = &net->ipv6.fib_table_hash[h];
3219                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3220                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3221                                 __rt6_purge_dflt_routers(table);
3222                 }
3223         }
3224
3225         rcu_read_unlock();
3226 }
3227
3228 static void rtmsg_to_fib6_config(struct net *net,
3229                                  struct in6_rtmsg *rtmsg,
3230                                  struct fib6_config *cfg)
3231 {
3232         memset(cfg, 0, sizeof(*cfg));
3233
3234         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3235                          : RT6_TABLE_MAIN;
3236         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3237         cfg->fc_metric = rtmsg->rtmsg_metric;
3238         cfg->fc_expires = rtmsg->rtmsg_info;
3239         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3240         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3241         cfg->fc_flags = rtmsg->rtmsg_flags;
3242
3243         cfg->fc_nlinfo.nl_net = net;
3244
3245         cfg->fc_dst = rtmsg->rtmsg_dst;
3246         cfg->fc_src = rtmsg->rtmsg_src;
3247         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3248 }
3249
/* Handle the legacy SIOCADDRT/SIOCDELRT route ioctls.
 * Requires CAP_NET_ADMIN in the netns' owning user namespace; route
 * table mutation is serialized under the RTNL lock.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		/* copy_from_user() returns the number of bytes NOT
		 * copied, so any non-zero value means a fault
		 */
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
3286
3287 /*
3288  *      Drop the packet on the floor
3289  */
3290
/* Drop @skb for lack of a route: bump the appropriate SNMP counter,
 * send an ICMPv6 destination-unreachable with @code, and free the skb.
 * @ipstats_mib_noroutes selects the input or output no-route counter.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* an unspecified destination is an address error,
			 * not a routing failure
			 */
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3313
/* dst.input handler for reject routes: drop with "no route" ICMP. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3318
3319 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3320 {
3321         skb->dev = skb_dst(skb)->dev;
3322         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3323 }
3324
/* dst.input handler for prohibit routes: drop with "administratively
 * prohibited" ICMP.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3329
3330 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3331 {
3332         skb->dev = skb_dst(skb)->dev;
3333         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3334 }
3335
3336 /*
3337  *      Allocate a dst for local (unicast / anycast) address.
3338  */
3339
3340 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3341                                     const struct in6_addr *addr,
3342                                     bool anycast)
3343 {
3344         u32 tb_id;
3345         struct net *net = dev_net(idev->dev);
3346         struct net_device *dev = idev->dev;
3347         struct rt6_info *rt;
3348
3349         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3350         if (!rt)
3351                 return ERR_PTR(-ENOMEM);
3352
3353         in6_dev_hold(idev);
3354
3355         rt->dst.flags |= DST_HOST;
3356         rt->dst.input = ip6_input;
3357         rt->dst.output = ip6_output;
3358         rt->rt6i_idev = idev;
3359
3360         rt->rt6i_protocol = RTPROT_KERNEL;
3361         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3362         if (anycast)
3363                 rt->rt6i_flags |= RTF_ANYCAST;
3364         else
3365                 rt->rt6i_flags |= RTF_LOCAL;
3366
3367         rt->rt6i_gateway  = *addr;
3368         rt->rt6i_dst.addr = *addr;
3369         rt->rt6i_dst.plen = 128;
3370         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3371         rt->rt6i_table = fib6_get_table(net, tb_id);
3372
3373         return rt;
3374 }
3375
3376 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
        struct net_device *dev; /* restrict to this device; NULL matches any */
        struct net *net;        /* namespace being walked */
        struct in6_addr *addr;  /* preferred-source address being removed */
};
3382
3383 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3384 {
3385         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3386         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3387         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3388
3389         if (((void *)rt->dst.dev == dev || !dev) &&
3390             rt != net->ipv6.ip6_null_entry &&
3391             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3392                 spin_lock_bh(&rt6_exception_lock);
3393                 /* remove prefsrc entry */
3394                 rt->rt6i_prefsrc.plen = 0;
3395                 /* need to update cache as well */
3396                 rt6_exceptions_remove_prefsrc(rt);
3397                 spin_unlock_bh(&rt6_exception_lock);
3398         }
3399         return 0;
3400 }
3401
3402 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3403 {
3404         struct net *net = dev_net(ifp->idev->dev);
3405         struct arg_dev_net_ip adni = {
3406                 .dev = ifp->idev->dev,
3407                 .net = net,
3408                 .addr = &ifp->addr,
3409         };
3410         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3411 }
3412
3413 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3414
3415 /* Remove routers and update dst entries when gateway turn into host. */
3416 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3417 {
3418         struct in6_addr *gateway = (struct in6_addr *)arg;
3419
3420         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3421             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3422                 return -1;
3423         }
3424
3425         /* Further clean up cached routes in exception table.
3426          * This is needed because cached route may have a different
3427          * gateway than its 'parent' in the case of an ip redirect.
3428          */
3429         rt6_exceptions_clean_tohost(rt, gateway);
3430
3431         return 0;
3432 }
3433
/* @gateway stopped being a router: purge RA default routes through it and
 * redirect exceptions pointing at it, across all tables in @net.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3438
/* Argument bundle for fib6_ifdown(). */
struct arg_dev_net {
        struct net_device *dev; /* device going down; NULL matches all devices */
        struct net *net;        /* namespace being walked */
};
3443
/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
        const struct arg_dev_net *adn = arg;
        const struct net_device *dev = adn->dev;

        /* Return -1 (delete) for routes on the dying device -- or on any
         * device when @dev is NULL -- except the null entry.  A multipath
         * sibling is spared only when the device is not being unregistered
         * and routes with link-down nexthops are configured to be ignored
         * rather than removed.
         */
        if ((rt->dst.dev == dev || !dev) &&
            rt != adn->net->ipv6.ip6_null_entry &&
            (rt->rt6i_nsiblings == 0 ||
             (dev && netdev_unregistering(dev)) ||
             !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
                return -1;

        return 0;
}
3459
3460 void rt6_ifdown(struct net *net, struct net_device *dev)
3461 {
3462         struct arg_dev_net adn = {
3463                 .dev = dev,
3464                 .net = net,
3465         };
3466
3467         fib6_clean_all(net, fib6_ifdown, &adn);
3468         if (dev)
3469                 rt6_uncached_list_flush_dev(net, dev);
3470 }
3471
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* device whose MTU changed */
        unsigned int mtu;       /* the new MTU */
};
3476
/* fib6_clean_all() callback for rt6_mtu_change(): propagate a device MTU
 * change to the RTAX_MTU metric of routes using that device, and to their
 * cached exception routes.  Always returns 0 (never deletes a route).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
        struct inet6_dev *idev;

        /* In IPv6 pmtu discovery is not optional,
           so that RTAX_MTU lock cannot disable it.
           We still use this lock to block changes
           caused by addrconf/ndisc.
        */

        idev = __in6_dev_get(arg->dev);
        if (!idev)
                return 0;

        /* For administrative MTU increase, there is no way to discover
           IPv6 PMTU increase, so PMTU increase should be updated here.
           Since RFC 1981 doesn't include administrative MTU increase
           update PMTU increase is a MUST. (i.e. jumbo frame)
         */
        /*
           If the new MTU is less than the route PMTU, the new MTU will be
           the lowest MTU in the path; update the route PMTU to reflect the
           decrease.  If the new MTU is greater than the route PMTU, and the
           old MTU is the lowest MTU in the path, update the route PMTU
           to reflect the increase.  In this case, if the other nodes' MTU
           is also the lowest in the path, a Packet Too Big message will
           trigger PMTU discovery.
         */
        if (rt->dst.dev == arg->dev &&
            dst_metric_raw(&rt->dst, RTAX_MTU) &&
            !dst_metric_locked(&rt->dst, RTAX_MTU)) {
                spin_lock_bh(&rt6_exception_lock);
                if (dst_mtu(&rt->dst) >= arg->mtu ||
                    (dst_mtu(&rt->dst) < arg->mtu &&
                     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
                        dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
                }
                /* cached (exception) routes carry their own PMTU; update too */
                rt6_exceptions_update_pmtu(rt, arg->mtu);
                spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
}
3520
3521 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3522 {
3523         struct rt6_mtu_change_arg arg = {
3524                 .dev = dev,
3525                 .mtu = mtu,
3526         };
3527
3528         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3529 }
3530
/* Netlink attribute policy for RTM_{NEW,DEL,GET}ROUTE requests. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
        [RTA_OIF]               = { .type = NLA_U32 },
        [RTA_IIF]               = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
        [RTA_PREF]              = { .type = NLA_U8 },
        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [RTA_ENCAP]             = { .type = NLA_NESTED },
        [RTA_EXPIRES]           = { .type = NLA_U32 },
        [RTA_UID]               = { .type = NLA_U32 },
        [RTA_MARK]              = { .type = NLA_U32 },
};
3545
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * struct fib6_config.  Returns 0 on success, a negative errno on a
 * malformed message.  Note that cfg->fc_mx, cfg->fc_mp and cfg->fc_encap
 * point into @nlh's attribute payload, so @cfg must not outlive the
 * message.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct fib6_config *cfg,
                              struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        unsigned int pref;
        int err;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          NULL);
        if (err < 0)
                goto errout;

        err = -EINVAL;
        rtm = nlmsg_data(nlh);
        memset(cfg, 0, sizeof(*cfg));

        cfg->fc_table = rtm->rtm_table;
        cfg->fc_dst_len = rtm->rtm_dst_len;
        cfg->fc_src_len = rtm->rtm_src_len;
        cfg->fc_flags = RTF_UP;
        cfg->fc_protocol = rtm->rtm_protocol;
        cfg->fc_type = rtm->rtm_type;

        /* reject-type routes never forward packets */
        if (rtm->rtm_type == RTN_UNREACHABLE ||
            rtm->rtm_type == RTN_BLACKHOLE ||
            rtm->rtm_type == RTN_PROHIBIT ||
            rtm->rtm_type == RTN_THROW)
                cfg->fc_flags |= RTF_REJECT;

        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;

        if (rtm->rtm_flags & RTM_F_CLONED)
                cfg->fc_flags |= RTF_CACHE;

        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
        cfg->fc_nlinfo.nlh = nlh;
        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

        if (tb[RTA_GATEWAY]) {
                cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
                cfg->fc_flags |= RTF_GATEWAY;
        }

        if (tb[RTA_DST]) {
                int plen = (rtm->rtm_dst_len + 7) >> 3;

                /* attribute must cover the prefix, rounded up to bytes */
                if (nla_len(tb[RTA_DST]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
        }

        if (tb[RTA_SRC]) {
                int plen = (rtm->rtm_src_len + 7) >> 3;

                if (nla_len(tb[RTA_SRC]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
        }

        if (tb[RTA_PREFSRC])
                cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

        if (tb[RTA_OIF])
                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_PRIORITY])
                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

        if (tb[RTA_METRICS]) {
                /* aliases the message payload; converted later */
                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
        }

        if (tb[RTA_TABLE])
                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

        if (tb[RTA_MULTIPATH]) {
                cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
                cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

                err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
                                                     cfg->fc_mp_len, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_PREF]) {
                /* unknown router-preference values fall back to medium */
                pref = nla_get_u8(tb[RTA_PREF]);
                if (pref != ICMPV6_ROUTER_PREF_LOW &&
                    pref != ICMPV6_ROUTER_PREF_HIGH)
                        pref = ICMPV6_ROUTER_PREF_MEDIUM;
                cfg->fc_flags |= RTF_PREF(pref);
        }

        if (tb[RTA_ENCAP])
                cfg->fc_encap = tb[RTA_ENCAP];

        if (tb[RTA_ENCAP_TYPE]) {
                cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

                err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_EXPIRES]) {
                unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

                if (addrconf_finite_timeout(timeout)) {
                        cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
                        cfg->fc_flags |= RTF_EXPIRES;
                }
        }

        err = 0;
errout:
        return err;
}
3669
/* One pending nexthop while a multipath route is being built. */
struct rt6_nh {
        struct rt6_info *rt6_info;      /* route to insert; NULLed once consumed */
        struct fib6_config r_cfg;       /* per-nexthop config, kept for rollback */
        struct mx6_config mxc;          /* converted metrics; mxc.mx is kfree'd */
        struct list_head next;          /* link on the caller's rt6_nh_list */
};
3676
3677 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3678 {
3679         struct rt6_nh *nh;
3680
3681         list_for_each_entry(nh, rt6_nh_list, next) {
3682                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3683                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3684                         nh->r_cfg.fc_ifindex);
3685         }
3686 }
3687
3688 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3689                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3690 {
3691         struct rt6_nh *nh;
3692         int err = -EEXIST;
3693
3694         list_for_each_entry(nh, rt6_nh_list, next) {
3695                 /* check if rt6_info already exists */
3696                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3697                         return err;
3698         }
3699
3700         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3701         if (!nh)
3702                 return -ENOMEM;
3703         nh->rt6_info = rt;
3704         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3705         if (err) {
3706                 kfree(nh);
3707                 return err;
3708         }
3709         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3710         list_add_tail(&nh->next, rt6_nh_list);
3711
3712         return 0;
3713 }
3714
3715 static void ip6_route_mpath_notify(struct rt6_info *rt,
3716                                    struct rt6_info *rt_last,
3717                                    struct nl_info *info,
3718                                    __u16 nlflags)
3719 {
3720         /* if this is an APPEND route, then rt points to the first route
3721          * inserted and rt_last points to last route inserted. Userspace
3722          * wants a consistent dump of the route which starts at the first
3723          * nexthop. Since sibling routes are always added at the end of
3724          * the list, find the first sibling of the last route appended
3725          */
3726         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3727                 rt = list_first_entry(&rt_last->rt6i_siblings,
3728                                       struct rt6_info,
3729                                       rt6i_siblings);
3730         }
3731
3732         if (rt)
3733                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3734 }
3735
/* Insert every nexthop of an RTA_MULTIPATH request as sibling routes.
 * One notification describing the whole route is sent at the end; when a
 * nexthop fails to insert, the ones added so far are deleted again (see
 * add_errout) before returning the error.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
                                   struct netlink_ext_ack *extack)
{
        struct rt6_info *rt_notif = NULL, *rt_last = NULL;
        struct nl_info *info = &cfg->fc_nlinfo;
        struct fib6_config r_cfg;
        struct rtnexthop *rtnh;
        struct rt6_info *rt;
        struct rt6_nh *err_nh;
        struct rt6_nh *nh, *nh_safe;
        __u16 nlflags;
        int remaining;
        int attrlen;
        int err = 1;
        int nhn = 0;
        int replace = (cfg->fc_nlinfo.nlh &&
                       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
        LIST_HEAD(rt6_nh_list);

        nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
        if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
                nlflags |= NLM_F_APPEND;

        remaining = cfg->fc_mp_len;
        rtnh = (struct rtnexthop *)cfg->fc_mp;

        /* Parse a Multipath Entry and build a list (rt6_nh_list) of
         * rt6_info structs per nexthop
         */
        while (rtnh_ok(rtnh, remaining)) {
                memcpy(&r_cfg, cfg, sizeof(*cfg));
                if (rtnh->rtnh_ifindex)
                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        if (nla) {
                                r_cfg.fc_gateway = nla_get_in6_addr(nla);
                                r_cfg.fc_flags |= RTF_GATEWAY;
                        }
                        r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
                        nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
                        if (nla)
                                r_cfg.fc_encap_type = nla_get_u16(nla);
                }

                rt = ip6_route_info_create(&r_cfg, extack);
                if (IS_ERR(rt)) {
                        err = PTR_ERR(rt);
                        rt = NULL;
                        goto cleanup;
                }

                err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
                if (err) {
                        /* rt was never queued; drop its reference here */
                        dst_release_immediate(&rt->dst);
                        goto cleanup;
                }

                rtnh = rtnh_next(rtnh, &remaining);
        }

        /* for add and replace send one notification with all nexthops.
         * Skip the notification in fib6_add_rt2node and send one with
         * the full route when done
         */
        info->skip_notify = 1;

        err_nh = NULL;
        list_for_each_entry(nh, &rt6_nh_list, next) {
                rt_last = nh->rt6_info;
                err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
                /* save reference to first route for notification */
                if (!rt_notif && !err)
                        rt_notif = nh->rt6_info;

                /* nh->rt6_info is used or freed at this point, reset to NULL*/
                nh->rt6_info = NULL;
                if (err) {
                        if (replace && nhn)
                                ip6_print_replace_route_err(&rt6_nh_list);
                        err_nh = nh;
                        goto add_errout;
                }

                /* Because each route is added like a single route we remove
                 * these flags after the first nexthop: if there is a collision,
                 * we have already failed to add the first nexthop:
                 * fib6_add_rt2node() has rejected it; when replacing, old
                 * nexthops have been replaced by first new, the rest should
                 * be added to it.
                 */
                cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
                                                     NLM_F_REPLACE);
                nhn++;
        }

        /* success ... tell user about new route */
        ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
        goto cleanup;

add_errout:
        /* send notification for routes that were added so that
         * the delete notifications sent by ip6_route_del are
         * coherent
         */
        if (rt_notif)
                ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

        /* Delete routes that were already added */
        list_for_each_entry(nh, &rt6_nh_list, next) {
                if (err_nh == nh)
                        break;
                ip6_route_del(&nh->r_cfg, extack);
        }

cleanup:
        list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
                if (nh->rt6_info)
                        dst_release_immediate(&nh->rt6_info->dst);
                kfree(nh->mxc.mx);
                list_del(&nh->next);
                kfree(nh);
        }

        return err;
}
3866
3867 static int ip6_route_multipath_del(struct fib6_config *cfg,
3868                                    struct netlink_ext_ack *extack)
3869 {
3870         struct fib6_config r_cfg;
3871         struct rtnexthop *rtnh;
3872         int remaining;
3873         int attrlen;
3874         int err = 1, last_err = 0;
3875
3876         remaining = cfg->fc_mp_len;
3877         rtnh = (struct rtnexthop *)cfg->fc_mp;
3878
3879         /* Parse a Multipath Entry */
3880         while (rtnh_ok(rtnh, remaining)) {
3881                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3882                 if (rtnh->rtnh_ifindex)
3883                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3884
3885                 attrlen = rtnh_attrlen(rtnh);
3886                 if (attrlen > 0) {
3887                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3888
3889                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3890                         if (nla) {
3891                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3892                                 r_cfg.fc_flags |= RTF_GATEWAY;
3893                         }
3894                 }
3895                 err = ip6_route_del(&r_cfg, extack);
3896                 if (err)
3897                         last_err = err;
3898
3899                 rtnh = rtnh_next(rtnh, &remaining);
3900         }
3901
3902         return last_err;
3903 }
3904
3905 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3906                               struct netlink_ext_ack *extack)
3907 {
3908         struct fib6_config cfg;
3909         int err;
3910
3911         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3912         if (err < 0)
3913                 return err;
3914
3915         if (cfg.fc_mp)
3916                 return ip6_route_multipath_del(&cfg, extack);
3917         else {
3918                 cfg.fc_delete_all_nh = 1;
3919                 return ip6_route_del(&cfg, extack);
3920         }
3921 }
3922
3923 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3924                               struct netlink_ext_ack *extack)
3925 {
3926         struct fib6_config cfg;
3927         int err;
3928
3929         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3930         if (err < 0)
3931                 return err;
3932
3933         if (cfg.fc_mp)
3934                 return ip6_route_multipath_add(&cfg, extack);
3935         else
3936                 return ip6_route_add(&cfg, extack);
3937 }
3938
/* Worst-case netlink message size needed to dump @rt, including one
 * rtnexthop + gateway + encap per sibling when the route is multipath.
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
        int nexthop_len = 0;

        if (rt->rt6i_nsiblings) {
                nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
                            + NLA_ALIGN(sizeof(struct rtnexthop))
                            + nla_total_size(16) /* RTA_GATEWAY */
                            + lwtunnel_get_encap_size(rt->dst.lwtstate);

                nexthop_len *= rt->rt6i_nsiblings;
        }

        return NLMSG_ALIGN(sizeof(struct rtmsg))
               + nla_total_size(16) /* RTA_SRC */
               + nla_total_size(16) /* RTA_DST */
               + nla_total_size(16) /* RTA_GATEWAY */
               + nla_total_size(16) /* RTA_PREFSRC */
               + nla_total_size(4) /* RTA_TABLE */
               + nla_total_size(4) /* RTA_IIF */
               + nla_total_size(4) /* RTA_OIF */
               + nla_total_size(4) /* RTA_PRIORITY */
               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
               + nla_total_size(sizeof(struct rta_cacheinfo))
               + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
               + nla_total_size(1) /* RTA_PREF */
               + lwtunnel_get_encap_size(rt->dst.lwtstate)
               + nexthop_len;
}
3968
/* Emit the attributes describing @rt's nexthop (gateway, oif, encap)
 * into @skb and accumulate RTNH_F_* state into *@flags.  @skip_oif is
 * set by the multipath encoder, where the ifindex lives in struct
 * rtnexthop instead of an RTA_OIF attribute.  Returns 0 or -EMSGSIZE.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
                            unsigned int *flags, bool skip_oif)
{
        /* report link state; DEAD only when such routes are being ignored */
        if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
                *flags |= RTNH_F_LINKDOWN;
                if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
                        *flags |= RTNH_F_DEAD;
        }

        if (rt->rt6i_flags & RTF_GATEWAY) {
                if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
                        goto nla_put_failure;
        }

        if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
                *flags |= RTNH_F_OFFLOAD;

        /* not needed for multipath encoding b/c it has a rtnexthop struct */
        if (!skip_oif && rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;

        if (rt->dst.lwtstate &&
            lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
                goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}
4000
/* add multipath next hop: emit one struct rtnexthop (plus its nested
 * attributes) for @rt inside an open RTA_MULTIPATH attribute.
 * Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
        struct rtnexthop *rtnh;
        unsigned int flags = 0;

        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
        if (!rtnh)
                goto nla_put_failure;

        rtnh->rtnh_hops = 0;
        rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

        /* skip_oif=true: the ifindex above replaces an RTA_OIF attribute */
        if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
                goto nla_put_failure;

        rtnh->rtnh_flags = flags;

        /* length of rtnetlink header + attributes */
        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}
4027
/* Fill @skb with one netlink route message (struct rtmsg + attributes)
 * describing @rt.  @dst/@src, when non-NULL, override the route's own
 * prefixes with full /128 addresses (RTM_GETROUTE replies); @iif selects
 * input-interface reporting.  Returns 0 on success or -EMSGSIZE, in
 * which case the partially built message is cancelled.
 */
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags)
{
        u32 metrics[RTAX_MAX];
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
        u32 table;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        rtm->rtm_table = table;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;
        /* map dst.error back to the route type userspace installed */
        if (rt->rt6i_flags & RTF_REJECT) {
                switch (rt->dst.error) {
                case -EINVAL:
                        rtm->rtm_type = RTN_BLACKHOLE;
                        break;
                case -EACCES:
                        rtm->rtm_type = RTN_PROHIBIT;
                        break;
                case -EAGAIN:
                        rtm->rtm_type = RTN_THROW;
                        break;
                default:
                        rtm->rtm_type = RTN_UNREACHABLE;
                        break;
                }
        }
        else if (rt->rt6i_flags & RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
        else if (rt->rt6i_flags & RTF_ANYCAST)
                rtm->rtm_type = RTN_ANYCAST;
        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_protocol = rt->rt6i_protocol;

        if (rt->rt6i_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* explicit @dst (getroute reply) reports a full host address */
        if (dst) {
                if (nla_put_in6_addr(skb, RTA_DST, dst))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                if (nla_put_in6_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                /* multicast destinations are resolved via the mroute table */
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, portid);

                        if (err == 0)
                                return 0;
                        if (err < 0)
                                goto nla_put_failure;
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dst) {
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        if (rt->rt6i_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->rt6i_prefsrc.addr;
                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        /* a per-route PMTU overrides the generic metric */
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt6i_pmtu)
                metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
                goto nla_put_failure;

        /* For multipath routes, walk the siblings list and add
         * each as a nexthop within RTA_MULTIPATH.
         */
        if (rt->rt6i_nsiblings) {
                struct rt6_info *sibling, *next_sibling;
                struct nlattr *mp;

                mp = nla_nest_start(skb, RTA_MULTIPATH);
                if (!mp)
                        goto nla_put_failure;

                if (rt6_add_nexthop(skb, rt) < 0)
                        goto nla_put_failure;

                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->rt6i_siblings, rt6i_siblings) {
                        if (rt6_add_nexthop(skb, sibling) < 0)
                                goto nla_put_failure;
                }

                nla_nest_end(skb, mp);
        } else {
                if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
                        goto nla_put_failure;
        }

        expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
                goto nla_put_failure;

        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
                goto nla_put_failure;


        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
4181
4182 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4183 {
4184         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4185         struct net *net = arg->net;
4186
4187         if (rt == net->ipv6.ip6_null_entry)
4188                 return 0;
4189
4190         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4191                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4192
4193                 /* user wants prefix routes only */
4194                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4195                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4196                         /* success since this is not a prefix route */
4197                         return 1;
4198                 }
4199         }
4200
4201         return rt6_fill_node(net,
4202                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4203                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4204                      NLM_F_MULTI);
4205 }
4206
/* RTM_GETROUTE doit handler: resolve a single IPv6 route for the flow
 * described by the netlink attributes and unicast the answer back to the
 * requesting socket.
 *
 * With RTM_F_FIB_MATCH set, the raw FIB entry is returned (via
 * ip6_route_lookup()) instead of the fully resolved input/output route.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        int err, iif = 0, oif = 0;
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi6 fl6;
        bool fibmatch;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          extack);
        if (err < 0)
                goto errout;

        /* default error for the attribute-validation gotos below */
        err = -EINVAL;
        memset(&fl6, 0, sizeof(fl6));
        rtm = nlmsg_data(nlh);
        fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
        fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

        if (tb[RTA_SRC]) {
                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
        }

        if (tb[RTA_DST]) {
                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
        }

        if (tb[RTA_IIF])
                iif = nla_get_u32(tb[RTA_IIF]);

        if (tb[RTA_OIF])
                oif = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_MARK])
                fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

        /* no explicit UID: simulated-input lookups carry no UID, locally
         * originated lookups use the caller's UID
         */
        if (tb[RTA_UID])
                fl6.flowi6_uid = make_kuid(current_user_ns(),
                                           nla_get_u32(tb[RTA_UID]));
        else
                fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

        if (iif) {
                /* simulate packet reception on the given interface */
                struct net_device *dev;
                int flags = 0;

                /* RCU protects the dev lookup for the duration of the
                 * route lookup only
                 */
                rcu_read_lock();

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        rcu_read_unlock();
                        err = -ENODEV;
                        goto errout;
                }

                fl6.flowi6_iif = iif;

                if (!ipv6_addr_any(&fl6.saddr))
                        flags |= RT6_LOOKUP_F_HAS_SADDR;

                if (!fibmatch)
                        dst = ip6_route_input_lookup(net, dev, &fl6, flags);
                else
                        dst = ip6_route_lookup(net, &fl6, 0);

                rcu_read_unlock();
        } else {
                /* simulate locally generated output */
                fl6.flowi6_oif = oif;

                if (!fibmatch)
                        dst = ip6_route_output(net, NULL, &fl6);
                else
                        dst = ip6_route_lookup(net, &fl6, 0);
        }


        /* the lookups above never return NULL; errors are carried in
         * dst.error on the returned entry
         */
        rt = container_of(dst, struct rt6_info, dst);
        if (rt->dst.error) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        if (rt == net->ipv6.ip6_null_entry) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                ip6_rt_put(rt);
                err = -ENOBUFS;
                goto errout;
        }

        /* the skb takes over the rt reference here; later error paths
         * drop it through kfree_skb() instead of ip6_rt_put()
         */
        skb_dst_set(skb, &rt->dst);
        if (fibmatch)
                err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, 0);
        else
                err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, 0);
        if (err < 0) {
                kfree_skb(skb);
                goto errout;
        }

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;
}
4332
4333 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4334                      unsigned int nlm_flags)
4335 {
4336         struct sk_buff *skb;
4337         struct net *net = info->nl_net;
4338         u32 seq;
4339         int err;
4340
4341         err = -ENOBUFS;
4342         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4343
4344         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4345         if (!skb)
4346                 goto errout;
4347
4348         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4349                                 event, info->portid, seq, nlm_flags);
4350         if (err < 0) {
4351                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4352                 WARN_ON(err == -EMSGSIZE);
4353                 kfree_skb(skb);
4354                 goto errout;
4355         }
4356         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4357                     info->nlh, gfp_any());
4358         return;
4359 errout:
4360         if (err < 0)
4361                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4362 }
4363
/* Netdevice notifier: the per-netns special routes (null, and
 * prohibit/blackhole with policy routing) are anchored on the netns
 * loopback device, so take their dev/idev references when loopback
 * registers and drop them when it unregisters.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);

        /* only the loopback device anchors the special entries */
        if (!(dev->flags & IFF_LOOPBACK))
                return NOTIFY_OK;

        if (event == NETDEV_REGISTER) {
                net->ipv6.ip6_null_entry->dst.dev = dev;
                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
         } else if (event == NETDEV_UNREGISTER &&
                    dev->reg_state != NETREG_UNREGISTERED) {
                /* NETDEV_UNREGISTER could be fired for multiple times by
                 * netdev_wait_allrefs(). Make sure we only call this once.
                 */
                in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
                in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
        }

        return NOTIFY_OK;
}
4396
4397 /*
4398  *      /proc
4399  */
4400
4401 #ifdef CONFIG_PROC_FS
4402
/* /proc/net/ipv6_route: per-netns route table dump (seq_file based) */
static const struct file_operations ipv6_route_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = ipv6_route_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
4410
4411 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4412 {
4413         struct net *net = (struct net *)seq->private;
4414         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4415                    net->ipv6.rt6_stats->fib_nodes,
4416                    net->ipv6.rt6_stats->fib_route_nodes,
4417                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4418                    net->ipv6.rt6_stats->fib_rt_entries,
4419                    net->ipv6.rt6_stats->fib_rt_cache,
4420                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4421                    net->ipv6.rt6_stats->fib_discarded_routes);
4422
4423         return 0;
4424 }
4425
/* Open handler for /proc/net/rt6_stats: single-shot, netns aware. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, rt6_stats_seq_show);
}
4430
/* /proc/net/rt6_stats: routing statistics counters */
static const struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release_net,
};
4438 #endif  /* CONFIG_PROC_FS */
4439
4440 #ifdef CONFIG_SYSCTL
4441
4442 static
4443 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4444                               void __user *buffer, size_t *lenp, loff_t *ppos)
4445 {
4446         struct net *net;
4447         int delay;
4448         if (!write)
4449                 return -EINVAL;
4450
4451         net = (struct net *)ctl->extra1;
4452         delay = net->ipv6.sysctl.flush_delay;
4453         proc_dointvec(ctl, write, buffer, lenp, ppos);
4454         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4455         return 0;
4456 }
4457
/* Template for the per-netns /proc/sys/net/ipv6/route/ table.
 *
 * NOTE: entry order is load-bearing -- ipv6_route_sysctl_init() patches
 * each .data pointer by index (table[0]..table[9]); keep both in sync.
 * The init_net .data values here are placeholders that sysctl_init
 * replaces with the per-netns variables.
 */
struct ctl_table ipv6_route_table_template[] = {
        {
                /* write-only trigger handled by ipv6_sysctl_rtcache_flush() */
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* same variable as gc_min_interval, exposed in milliseconds */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        { }
};
4531
/* Clone the route sysctl template for a new netns and repoint each
 * entry's .data at that netns' copy of the variable.
 *
 * Returns the kmemdup'd table (owned/freed by the caller) or NULL on OOM.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
        struct ctl_table *table;

        table = kmemdup(ipv6_route_table_template,
                        sizeof(ipv6_route_table_template),
                        GFP_KERNEL);

        if (table) {
                /* indices must match ipv6_route_table_template[] order */
                table[0].data = &net->ipv6.sysctl.flush_delay;
                table[0].extra1 = net;  /* flush handler needs the netns */
                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

                /* Don't export sysctls to unprivileged users */
                /* NOTE(review): nulling only the first procname appears to
                 * rely on sysctl registration stopping at the first empty
                 * entry, hiding the whole table -- confirm against
                 * register_net_sysctl().
                 */
                if (net->user_ns != &init_user_ns)
                        table[0].procname = NULL;
        }

        return table;
}
4560 #endif
4561
/* Per-netns setup for IPv6 routing: copy the dst ops template, allocate
 * the always-present special route entries (null; plus prohibit and
 * blackhole when policy routing is configured) and seed the sysctl
 * defaults.
 *
 * Returns 0 or -ENOMEM; on failure, everything allocated so far is
 * undone through the goto chain at the bottom.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
        int ret = -ENOMEM;

        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                goto out_ip6_dst_ops;

        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_ip6_dst_entries;
        /* dst.path of a special entry points back at the entry itself */
        net->ipv6.ip6_null_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        net->ipv6.fib6_has_custom_rules = false;
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);

        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
#endif

        /* sysctl defaults; tunable later via /proc/sys/net/ipv6/route/ */
        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = 4096;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

        net->ipv6.ip6_rt_gc_expire = 30*HZ;

        ret = 0;
out:
        return ret;

        /* unwind in reverse allocation order */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
        goto out;
}
4634
/* Per-netns teardown: free the special route entries allocated in
 * ip6_route_net_init() and release the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
4644
4645 static int __net_init ip6_route_net_init_late(struct net *net)
4646 {
4647 #ifdef CONFIG_PROC_FS
4648         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4649         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4650 #endif
4651         return 0;
4652 }
4653
/* Late per-netns teardown: remove the /proc/net entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ipv6_route", net->proc_net);
        remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4661
/* Core per-netns routing state (registered early in ip6_route_init()) */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
4666
4667 static int __net_init ipv6_inetpeer_init(struct net *net)
4668 {
4669         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4670
4671         if (!bp)
4672                 return -ENOMEM;
4673         inet_peer_base_init(bp);
4674         net->ipv6.peers = bp;
4675         return 0;
4676 }
4677
/* Tear down the per-netns inetpeer base: detach the pointer first, then
 * invalidate the tree before freeing the base itself.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv6.peers;

        net->ipv6.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}
4686
/* Per-netns inetpeer storage (registered before ip6_route_net_ops) */
static struct pernet_operations ipv6_inetpeer_ops = {
        .init   =       ipv6_inetpeer_init,
        .exit   =       ipv6_inetpeer_exit,
};
4691
/* Late per-netns state: the /proc/net files */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};
4696
/* Runs after addrconf's notifier (lower priority number runs later here)
 * so address state is updated before the special routes are adjusted.
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4701
/* Bind init_net's special route entries to its loopback device at boot.
 * Other netns get this from ip6_route_dev_notify(); init_net's loopback
 * registers before that notifier exists, hence this manual hookup.
 */
void __init ip6_route_init_special_entries(void)
{
        /* Registering of the loopback is done before this portion of code,
         * the loopback reference in rt6_info will not be taken, do it
         * manually for init_net */
        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
4716
/* Boot-time initialization of the IPv6 routing subsystem: slab cache,
 * pernet subsystems, FIB, xfrm and policy rules, rtnetlink handlers,
 * device notifier and the per-cpu uncached route lists.
 *
 * Returns 0 or a negative errno; each failure unwinds everything set up
 * before it via the goto chain below (labels in reverse init order).
 */
int __init ip6_route_init(void)
{
        int ret;
        int cpu;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
        if (ret)
                goto out_dst_entries;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_register_inetpeer;

        /* blackhole dsts share the regular rt6_info slab cache */
        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        ret = -ENOBUFS;
        if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
            __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
            __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
                            RTNL_FLAG_DOIT_UNLOCKED))
                goto out_register_late_subsys;

        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }

out:
        return ret;

        /* error unwinding, reverse order of the init steps above */
out_register_late_subsys:
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}
4798
/* Module teardown: undo ip6_route_init() in exact reverse order. */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}