/*
 * net/ipv6/route.c (mirrored from linux.git; topmost change in this
 * snapshot: "ipv6: fix a BUG in rt6_get_pcpu_route()")
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Neighbour-reachability verdicts produced by rt6_check_neigh() and
 * consumed by rt6_score_route()/find_match().  Negative values are
 * failures of increasing severity.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be used */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour exists but is NUD_FAILED */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour: request round-robin */
	RT6_NUD_SUCCEED = 1
};
80
/* Forward declarations for the dst_ops callbacks and helpers defined
 * later in this file.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

/* Per-cpu list of routes kept outside the FIB tree, so that device
 * teardown can re-home them (see rt6_uncached_list_flush_dev()).
 * Each list is protected by its own spinlock.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
/* Link @rt onto this CPU's uncached list so rt6_uncached_list_flush_dev()
 * can find it on device teardown.  The owning list is remembered in
 * rt->rt6i_uncached_list for later removal.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
141
/* Unlink @rt from the uncached list it was added to (if any) and drop
 * the per-netns uncached-route counter.  Safe on routes that were never
 * added: list_empty() then skips the whole removal.
 */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
154
/* When @dev is unregistered, walk every CPU's uncached list and re-point
 * any route still referencing @dev (via its inet6_dev and/or dst.dev) at
 * @net's loopback device, moving the references accordingly.  Nothing to
 * do when @dev is the loopback device itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Swap the inet6_dev reference to loopback. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Swap the net_device reference to loopback. */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(rt->dst.from);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
/* dst_ops.confirm_neigh callback: confirm reachability of the neighbour
 * entry for this route's next hop (gateway if set, else @daddr).
 * Skipped for devices that do no neighbour discovery and for multicast
 * destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
245
/* dst_ops used for regular IPv6 routes; template instantiated per
 * network namespace (net->ipv6.ip6_dst_ops).
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/* Blackhole dsts ignore PMTU updates: deliberate no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
276
/* Blackhole dsts ignore redirects: deliberate no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
281
/* dst_ops for blackhole routes: PMTU updates and redirects are ignored
 * via the no-op callbacks above.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
293
/* Metrics template for the special route entries below.  RTAX_* values
 * are 1-based, hence the "- 1" array index.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
297
/* Template for the per-netns ip6_null_entry: a REJECT route that
 * discards packets with -ENETUNREACH (returned when no route matches).
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* Template for ip6_prohibit_entry: REJECT route failing with -EACCES
 * (administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
329
/* Template for ip6_blk_hole_entry: silently discards packets via the
 * generic dst_discard handlers (error -EINVAL).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
344
345 #endif
346
/* Initialize the rt6_info-specific part of a freshly allocated route. */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* Zero everything after the embedded dst_entry (dst_alloc already
	 * set up the dst itself), then initialize the list heads.
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
355
/* Allocate and initialize an rt6_info via ip6_dst_ops, bumping the
 * per-netns fib_rt_alloc counter.  Returns NULL on allocation failure.
 */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (rt->rt6i_pcpu) {
381                         int cpu;
382
383                         for_each_possible_cpu(cpu) {
384                                 struct rt6_info **p;
385
386                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
387                                 /* no one shares rt */
388                                 *p =  NULL;
389                         }
390                 } else {
391                         dst_release_immediate(&rt->dst);
392                         return NULL;
393                 }
394         }
395
396         return rt;
397 }
398 EXPORT_SYMBOL(ip6_dst_alloc);
399
/* dst_ops.destroy callback: final teardown of an rt6_info.  Frees the
 * per-cpu clones, unlinks from the uncached list, and drops the
 * inet6_dev, exception-bucket and "from" references.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* No concurrent readers at destroy time, hence the unconditional
	 * "1" passed to rcu_dereference_protected().
	 */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	dst->from = NULL;
	dst_release(from);
}
425
/* dst_ops.ifdown callback: when @dev goes down, re-point the route's
 * inet6_dev reference at the loopback device of @dev's namespace so the
 * route does not pin the disappearing device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
442
443 static bool __rt6_check_expired(const struct rt6_info *rt)
444 {
445         if (rt->rt6i_flags & RTF_EXPIRES)
446                 return time_after(jiffies, rt->dst.expires);
447         else
448                 return false;
449 }
450
/* Like __rt6_check_expired(), but a cached clone (dst.from set) is also
 * considered expired when its parent route was obsoleted or has itself
 * expired (checked recursively).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}
462
463 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
464                                              struct flowi6 *fl6, int oif,
465                                              int strict)
466 {
467         struct rt6_info *sibling, *next_sibling;
468         int route_choosen;
469
470         /* We might have already computed the hash for ICMPv6 errors. In such
471          * case it will always be non-zero. Otherwise now is the time to do it.
472          */
473         if (!fl6->mp_hash)
474                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
475
476         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
477         /* Don't change the route, if route_choosen == 0
478          * (siblings does not include ourself)
479          */
480         if (route_choosen)
481                 list_for_each_entry_safe(sibling, next_sibling,
482                                 &match->rt6i_siblings, rt6i_siblings) {
483                         route_choosen--;
484                         if (route_choosen == 0) {
485                                 if (rt6_score_route(sibling, oif, strict) < 0)
486                                         break;
487                                 match = sibling;
488                                 break;
489                         }
490                 }
491         return match;
492 }
493
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

/* Walk the route list starting at @rt and pick the entry matching the
 * output interface @oif (or, with no oif, the entry whose device owns
 * @saddr).  Loopback routes are remembered as a fallback candidate.
 * Returns ip6_null_entry when a strict interface match
 * (RT6_LOOKUP_F_IFACE) is required but not found; otherwise falls back
 * to @rt itself.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* Nothing to constrain on: any route will do. */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				/* Prefer a loopback route whose idev matches
				 * @oif over one that does not.
				 */
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
544
545 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-probe context: carries the NS target address and the (held)
 * device from rt6_probe() to rt6_probe_deferred().
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
551
/* Workqueue handler for rt6_probe(): send a Neighbour Solicitation to
 * the target's solicited-node multicast address, then drop the device
 * reference taken by rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
563
/* Schedule a reachability probe of the route's gateway when its
 * neighbour entry is missing or not NUD_VALID and rtr_probe_interval
 * has elapsed; the NS itself is sent from workqueue context via
 * rt6_probe_deferred().
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* Re-check state under neigh->lock; only mark the neighbour
		 * for a probe if the work item allocation succeeded.
		 */
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry yet: always probe. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* Reference dropped in rt6_probe_deferred(). */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
610 #else
/* Router Reachability Probing only exists with CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
614 #endif
615
616 /*
617  * Default Router Selection (RFC 2461 6.3.6)
618  */
619 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
620 {
621         struct net_device *dev = rt->dst.dev;
622         if (!oif || dev->ifindex == oif)
623                 return 2;
624         if ((dev->flags & IFF_LOOPBACK) &&
625             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
626                 return 1;
627         return 0;
628 }
629
/* Classify next-hop reachability for route scoring.  Routes without a
 * gateway (or flagged RTF_NONEXTHOP) always succeed.  With router
 * preference support, an existing neighbour only fails when NUD_FAILED
 * and a missing neighbour succeeds; without it, a missing neighbour
 * requests round-robin instead.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
660
661 static int rt6_score_route(struct rt6_info *rt, int oif,
662                            int strict)
663 {
664         int m;
665
666         m = rt6_check_dev(rt, oif);
667         if (!m && (strict & RT6_LOOKUP_F_IFACE))
668                 return RT6_NUD_FAIL_HARD;
669 #ifdef CONFIG_IPV6_ROUTER_PREF
670         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
671 #endif
672         if (strict & RT6_LOOKUP_F_REACHABLE) {
673                 int n = rt6_check_neigh(rt);
674                 if (n < 0)
675                         return n;
676         }
677         return m;
678 }
679
/* Score @rt against (@oif, @strict) and return whichever of @rt/@match
 * scores higher; *mpri tracks the best score seen so far and *do_rr is
 * set when round-robin among equally scored routes should happen.
 * Expired routes and (unless ignored) link-down devices never match.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
717
/* Find the best route among the node's entries that share @metric,
 * scanning from @rr_head (the round-robin pointer) to the end of the
 * list and then wrapping from @leaf back up to @rr_head.  When nothing
 * matched at @metric, continue scoring from the first route with a
 * different metric (@cont).
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Wrap-around part of the scan: leaf up to (excluding) rr_head. */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* No match at @metric: fall back to scoring the remaining routes. */
	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
756
/* Select the route to use from fib6_node @fn: validate that @fn is not
 * an intermediate node, score candidates via find_rr_leaf(), and — when
 * round-robin was requested — advance fn->rr_ptr under tb6_lock.
 * Returns ip6_null_entry when nothing usable is found.
 * Caller must hold rcu_read_lock().
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
806
807 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
808 {
809         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
810 }
811
812 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router Advertisement
 * from @gwaddr (field layout matches RFC 4191 — confirm against struct
 * route_info).  Adds, refreshes or deletes the corresponding
 * RTF_ROUTEINFO route based on the advertised lifetime and preference.
 * Returns 0 on success, -EINVAL for malformed options.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/* length == 3 means the full 128-bit prefix is present; otherwise
	 * zero-extend the truncated prefix into a local buffer.
	 */
	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len 0 advertises a default route. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
886 #endif
887
/* Walk back up the fib tree from @fn until a node carrying route info
 * (RTN_RTINFO) is found.  When the parent owns a source-address subtree
 * we did not come from, descend into it with a lookup on @saddr.
 * Returns NULL once the tree root is reached without a match.
 * Caller must hold rcu_read_lock() (we use rcu_dereference()).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
905
906 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
907                           bool null_fallback)
908 {
909         struct rt6_info *rt = *prt;
910
911         if (dst_hold_safe(&rt->dst))
912                 return true;
913         if (null_fallback) {
914                 rt = net->ipv6.ip6_null_entry;
915                 dst_hold(&rt->dst);
916         } else {
917                 rt = NULL;
918         }
919         *prt = rt;
920         return false;
921 }
922
/* Basic route lookup in @table for flow @fl6, done entirely under RCU.
 * Matches on output device/source address, considers multipath siblings
 * when no oif is given, backtracks up the tree on a miss, then prefers
 * a matching entry from the exception (RTF_CACHE) table.  The returned
 * route is held (the null entry on a complete miss).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* only load-balance across siblings when oif is unconstrained */
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	/* take a reference; falls back to the (held) null entry on failure */
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
963
/* Public lookup entry point: resolve @fl6 through the policy routing
 * rules using ip6_pol_route_lookup() as the per-table lookup function.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
970
971 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
972                             const struct in6_addr *saddr, int oif, int strict)
973 {
974         struct flowi6 fl6 = {
975                 .flowi6_oif = oif,
976                 .daddr = *daddr,
977         };
978         struct dst_entry *dst;
979         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
980
981         if (saddr) {
982                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983                 flags |= RT6_LOOKUP_F_HAS_SADDR;
984         }
985
986         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
987         if (dst->error == 0)
988                 return (struct rt6_info *) dst;
989
990         dst_release(dst);
991
992         return NULL;
993 }
994 EXPORT_SYMBOL(rt6_lookup);
995
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold a dst reference before calling it.
 */

/* Insert @rt into its fib6 table under tb6_lock.  Returns fib6_add()'s
 * result (0 on success, negative errno otherwise).
 */
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}
1016
/* Insert @rt into its table with default netlink info and no extra
 * metrics, taking the dst reference the fib6 tree will own.
 */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
1026
/* Pick the net_device that a clone of @rt should be bound to.
 * called with rcu_lock held
 */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
1049
/* Allocate an RTF_CACHE clone of @ort pinned to destination @daddr
 * (and, with subtrees, source @saddr).  The clone is a /128 host route
 * and carries the dst reference taken by __ip6_dst_alloc().
 * Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* cache/pcpu routes are themselves clones: copy from their origin */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* clone of a non-host on-link route towards @daddr itself */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1092
/* Allocate a per-cpu (RTF_PCPU) copy of @rt, sharing its dst flags and
 * protocol.  Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1109
/* Fetch this CPU's cached copy of @rt, taking a reference on it.
 * Returns NULL when no per-cpu copy exists yet, or when its refcount
 * already dropped to zero (ip6_hold_safe() with null_fallback=false).
 * It should be called with rcu_read_lock() acquired.
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
1123
/* Create and install a per-cpu copy of @rt in this CPU's slot, and
 * return it with an extra reference held.  On allocation failure the
 * (held) null entry is returned instead.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* extra hold for the caller, on top of the allocation's reference */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* NOTE(review): assumes the slot is still empty here; relies on
	 * the caller running with BHs disabled on this CPU — confirm.
	 */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1144
/* exception hash table implementation
 *
 * Serializes all writers of the per-route exception (RTF_CACHE) hash
 * tables; lookups on the read side run under RCU instead.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1148
1149 /* Remove rt6_ex from hash table and free the memory
1150  * Caller must hold rt6_exception_lock
1151  */
1152 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1153                                  struct rt6_exception *rt6_ex)
1154 {
1155         struct net *net = dev_net(rt6_ex->rt6i->dst.dev);
1156
1157         if (!bucket || !rt6_ex)
1158                 return;
1159         rt6_ex->rt6i->rt6i_node = NULL;
1160         hlist_del_rcu(&rt6_ex->hlist);
1161         rt6_release(rt6_ex->rt6i);
1162         kfree_rcu(rt6_ex, rcu);
1163         WARN_ON_ONCE(!bucket->depth);
1164         bucket->depth--;
1165         net->ipv6.rt6_stats->fib_rt_cache--;
1166 }
1167
1168 /* Remove oldest rt6_ex in bucket and free the memory
1169  * Caller must hold rt6_exception_lock
1170  */
1171 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1172 {
1173         struct rt6_exception *rt6_ex, *oldest = NULL;
1174
1175         if (!bucket)
1176                 return;
1177
1178         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1179                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1180                         oldest = rt6_ex;
1181         }
1182         rt6_remove_exception(bucket, oldest);
1183 }
1184
/* Hash (@dst, @src) into an exception-bucket index.  @src only
 * contributes when subtrees are compiled in.  The jhash seed is
 * randomized once per boot to make bucket placement unpredictable.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1200
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* side effect: advance the caller's pointer to the hashed slot so
	 * it can insert/remove in the right bucket afterwards
	 */
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1233
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * RCU-reader counterpart of __rt6_find_exception_spinlock(): same
 * lookup and same bucket-pointer side effect, but the chain is walked
 * with hlist_for_each_entry_rcu().
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance the caller's pointer to the hashed slot */
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1268
/* Insert cache route @nrt into the exception table of its origin @ort.
 * Replaces any existing entry for the same (daddr, saddr) key and
 * evicts the oldest entry when the bucket exceeds FIB6_MAX_DEPTH.
 * Returns 0 on success or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being removed; rt6_flush_exceptions() forbids new entries */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* lazily allocate the bucket array on first insertion */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing entry for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}
1356
/* Remove every exception entry hanging off @rt and permanently mark
 * its bucket list as flushed so rt6_insert_exception() cannot recreate
 * it.  Called when @rt is being torn down.
 */
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1383
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the cached clone for (@daddr, @saddr), or NULL when there is
 * none or the matching entry has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* expired entries are left for GC; pretend they are not there */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1415
1416 /* Remove the passed in cached rt from the hash table that contains it */
1417 int rt6_remove_exception_rt(struct rt6_info *rt)
1418 {
1419         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1420         struct rt6_exception_bucket *bucket;
1421         struct in6_addr *src_key = NULL;
1422         struct rt6_exception *rt6_ex;
1423         int err;
1424
1425         if (!from ||
1426             !(rt->rt6i_flags | RTF_CACHE))
1427                 return -EINVAL;
1428
1429         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1430                 return -ENOENT;
1431
1432         spin_lock_bh(&rt6_exception_lock);
1433         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1434                                     lockdep_is_held(&rt6_exception_lock));
1435 #ifdef CONFIG_IPV6_SUBTREES
1436         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1437          * and exception table is indexed by a hash of
1438          * both rt6i_dst and rt6i_src.
1439          * Otherwise, the exception table is indexed by
1440          * a hash of only rt6i_dst.
1441          */
1442         if (from->rt6i_src.plen)
1443                 src_key = &rt->rt6i_src.addr;
1444 #endif
1445         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1446                                                &rt->rt6i_dst.addr,
1447                                                src_key);
1448         if (rt6_ex) {
1449                 rt6_remove_exception(bucket, rt6_ex);
1450                 err = 0;
1451         } else {
1452                 err = -ENOENT;
1453         }
1454
1455         spin_unlock_bh(&rt6_exception_lock);
1456         return err;
1457 }
1458
1459 /* Find rt6_ex which contains the passed in rt cache and
1460  * refresh its stamp
1461  */
1462 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1463 {
1464         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1465         struct rt6_exception_bucket *bucket;
1466         struct in6_addr *src_key = NULL;
1467         struct rt6_exception *rt6_ex;
1468
1469         if (!from ||
1470             !(rt->rt6i_flags | RTF_CACHE))
1471                 return;
1472
1473         rcu_read_lock();
1474         bucket = rcu_dereference(from->rt6i_exception_bucket);
1475
1476 #ifdef CONFIG_IPV6_SUBTREES
1477         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1478          * and exception table is indexed by a hash of
1479          * both rt6i_dst and rt6i_src.
1480          * Otherwise, the exception table is indexed by
1481          * a hash of only rt6i_dst.
1482          */
1483         if (from->rt6i_src.plen)
1484                 src_key = &rt->rt6i_src.addr;
1485 #endif
1486         rt6_ex = __rt6_find_exception_rcu(&bucket,
1487                                           &rt->rt6i_dst.addr,
1488                                           src_key);
1489         if (rt6_ex)
1490                 rt6_ex->stamp = jiffies;
1491
1492         rcu_read_unlock();
1493 }
1494
1495 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1496 {
1497         struct rt6_exception_bucket *bucket;
1498         struct rt6_exception *rt6_ex;
1499         int i;
1500
1501         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1502                                         lockdep_is_held(&rt6_exception_lock));
1503
1504         if (bucket) {
1505                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1506                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1507                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1508                         }
1509                         bucket++;
1510                 }
1511         }
1512 }
1513
/* Lower the cached path MTU of every exception route of @rt to @mtu.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from has already
				 * been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}
1539
/* Mask matching routes that are both cache clones and gateway routes */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1541
/* Drop every cached gateway clone of @rt whose gateway equals
 * @gateway (e.g. because that router became unusable).
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap RCU peek before taking the writer lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1576
/* Garbage-collection policy for one exception entry: remove clones that
 * are unreferenced (refcnt == 1) and idle past gc_args->timeout, and
 * gateway clones whose neighbour no longer advertises itself as a
 * router.  Entries that survive bump gc_args->more so the GC reschedules.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
	gc_args->more++;
}
1607
/* Run rt6_age_examine_exception() over every exception entry of @rt as
 * part of fib6 garbage collection.
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap RCU peek before taking the writer lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}
1636
1637 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1638                                int oif, struct flowi6 *fl6, int flags)
1639 {
1640         struct fib6_node *fn, *saved_fn;
1641         struct rt6_info *rt, *rt_cache;
1642         int strict = 0;
1643
1644         strict |= flags & RT6_LOOKUP_F_IFACE;
1645         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1646         if (net->ipv6.devconf_all->forwarding == 0)
1647                 strict |= RT6_LOOKUP_F_REACHABLE;
1648
1649         rcu_read_lock();
1650
1651         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1652         saved_fn = fn;
1653
1654         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1655                 oif = 0;
1656
1657 redo_rt6_select:
1658         rt = rt6_select(net, fn, oif, strict);
1659         if (rt->rt6i_nsiblings)
1660                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1661         if (rt == net->ipv6.ip6_null_entry) {
1662                 fn = fib6_backtrack(fn, &fl6->saddr);
1663                 if (fn)
1664                         goto redo_rt6_select;
1665                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1666                         /* also consider unreachable route */
1667                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1668                         fn = saved_fn;
1669                         goto redo_rt6_select;
1670                 }
1671         }
1672
1673         /*Search through exception table */
1674         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1675         if (rt_cache)
1676                 rt = rt_cache;
1677
1678         if (rt == net->ipv6.ip6_null_entry) {
1679                 rcu_read_unlock();
1680                 dst_hold(&rt->dst);
1681                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1682                 return rt;
1683         } else if (rt->rt6i_flags & RTF_CACHE) {
1684                 if (ip6_hold_safe(net, &rt, true)) {
1685                         dst_use_noref(&rt->dst, jiffies);
1686                         rt6_dst_from_metrics_check(rt);
1687                 }
1688                 rcu_read_unlock();
1689                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1690                 return rt;
1691         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1692                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1693                 /* Create a RTF_CACHE clone which will not be
1694                  * owned by the fib6 tree.  It is for the special case where
1695                  * the daddr in the skb during the neighbor look-up is different
1696                  * from the fl6->daddr used to look-up route here.
1697                  */
1698
1699                 struct rt6_info *uncached_rt;
1700
1701                 if (ip6_hold_safe(net, &rt, true)) {
1702                         dst_use_noref(&rt->dst, jiffies);
1703                 } else {
1704                         rcu_read_unlock();
1705                         uncached_rt = rt;
1706                         goto uncached_rt_out;
1707                 }
1708                 rcu_read_unlock();
1709
1710                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1711                 dst_release(&rt->dst);
1712
1713                 if (uncached_rt) {
1714                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1715                          * No need for another dst_hold()
1716                          */
1717                         rt6_uncached_list_add(uncached_rt);
1718                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1719                 } else {
1720                         uncached_rt = net->ipv6.ip6_null_entry;
1721                         dst_hold(&uncached_rt->dst);
1722                 }
1723
1724 uncached_rt_out:
1725                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1726                 return uncached_rt;
1727
1728         } else {
1729                 /* Get a percpu copy */
1730
1731                 struct rt6_info *pcpu_rt;
1732
1733                 dst_use_noref(&rt->dst, jiffies);
1734                 local_bh_disable();
1735                 pcpu_rt = rt6_get_pcpu_route(rt);
1736
1737                 if (!pcpu_rt) {
1738                         /* atomic_inc_not_zero() is needed when using rcu */
1739                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1740                                 /* No dst_hold() on rt is needed because grabbing
1741                                  * rt->rt6i_ref makes sure rt can't be released.
1742                                  */
1743                                 pcpu_rt = rt6_make_pcpu_route(rt);
1744                                 rt6_release(rt);
1745                         } else {
1746                                 /* rt is already removed from tree */
1747                                 pcpu_rt = net->ipv6.ip6_null_entry;
1748                                 dst_hold(&pcpu_rt->dst);
1749                         }
1750                 }
1751                 local_bh_enable();
1752                 rcu_read_unlock();
1753                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1754                 return pcpu_rt;
1755         }
1756 }
1757 EXPORT_SYMBOL_GPL(ip6_pol_route);
1758
1759 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1760                                             struct flowi6 *fl6, int flags)
1761 {
1762         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1763 }
1764
1765 struct dst_entry *ip6_route_input_lookup(struct net *net,
1766                                          struct net_device *dev,
1767                                          struct flowi6 *fl6, int flags)
1768 {
1769         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1770                 flags |= RT6_LOOKUP_F_IFACE;
1771
1772         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1773 }
1774 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1775
1776 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1777                                   struct flow_keys *keys)
1778 {
1779         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1780         const struct ipv6hdr *key_iph = outer_iph;
1781         const struct ipv6hdr *inner_iph;
1782         const struct icmp6hdr *icmph;
1783         struct ipv6hdr _inner_iph;
1784
1785         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1786                 goto out;
1787
1788         icmph = icmp6_hdr(skb);
1789         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1790             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1791             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1792             icmph->icmp6_type != ICMPV6_PARAMPROB)
1793                 goto out;
1794
1795         inner_iph = skb_header_pointer(skb,
1796                                        skb_transport_offset(skb) + sizeof(*icmph),
1797                                        sizeof(_inner_iph), &_inner_iph);
1798         if (!inner_iph)
1799                 goto out;
1800
1801         key_iph = inner_iph;
1802 out:
1803         memset(keys, 0, sizeof(*keys));
1804         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1805         keys->addrs.v6addrs.src = key_iph->saddr;
1806         keys->addrs.v6addrs.dst = key_iph->daddr;
1807         keys->tags.flow_label = ip6_flowinfo(key_iph);
1808         keys->basic.ip_proto = key_iph->nexthdr;
1809 }
1810
1811 /* if skb is set it will be used and fl6 can be NULL */
1812 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1813 {
1814         struct flow_keys hash_keys;
1815
1816         if (skb) {
1817                 ip6_multipath_l3_keys(skb, &hash_keys);
1818                 return flow_hash_from_keys(&hash_keys);
1819         }
1820
1821         return get_hash_from_flowi6(fl6);
1822 }
1823
1824 void ip6_route_input(struct sk_buff *skb)
1825 {
1826         const struct ipv6hdr *iph = ipv6_hdr(skb);
1827         struct net *net = dev_net(skb->dev);
1828         int flags = RT6_LOOKUP_F_HAS_SADDR;
1829         struct ip_tunnel_info *tun_info;
1830         struct flowi6 fl6 = {
1831                 .flowi6_iif = skb->dev->ifindex,
1832                 .daddr = iph->daddr,
1833                 .saddr = iph->saddr,
1834                 .flowlabel = ip6_flowinfo(iph),
1835                 .flowi6_mark = skb->mark,
1836                 .flowi6_proto = iph->nexthdr,
1837         };
1838
1839         tun_info = skb_tunnel_info(skb);
1840         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1841                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1842         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1843                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1844         skb_dst_drop(skb);
1845         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1846 }
1847
1848 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1849                                              struct flowi6 *fl6, int flags)
1850 {
1851         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1852 }
1853
1854 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1855                                          struct flowi6 *fl6, int flags)
1856 {
1857         bool any_src;
1858
1859         if (rt6_need_strict(&fl6->daddr)) {
1860                 struct dst_entry *dst;
1861
1862                 dst = l3mdev_link_scope_lookup(net, fl6);
1863                 if (dst)
1864                         return dst;
1865         }
1866
1867         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1868
1869         any_src = ipv6_addr_any(&fl6->saddr);
1870         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1871             (fl6->flowi6_oif && any_src))
1872                 flags |= RT6_LOOKUP_F_IFACE;
1873
1874         if (!any_src)
1875                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1876         else if (sk)
1877                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1878
1879         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1880 }
1881 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1882
1883 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1884 {
1885         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1886         struct net_device *loopback_dev = net->loopback_dev;
1887         struct dst_entry *new = NULL;
1888
1889         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1890                        DST_OBSOLETE_NONE, 0);
1891         if (rt) {
1892                 rt6_info_init(rt);
1893                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1894
1895                 new = &rt->dst;
1896                 new->__use = 1;
1897                 new->input = dst_discard;
1898                 new->output = dst_discard_out;
1899
1900                 dst_copy_metrics(new, &ort->dst);
1901
1902                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1903                 rt->rt6i_gateway = ort->rt6i_gateway;
1904                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1905                 rt->rt6i_metric = 0;
1906
1907                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1908 #ifdef CONFIG_IPV6_SUBTREES
1909                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1910 #endif
1911         }
1912
1913         dst_release(dst_orig);
1914         return new ? new : ERR_PTR(-ENOMEM);
1915 }
1916
1917 /*
1918  *      Destination cache support functions
1919  */
1920
1921 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1922 {
1923         if (rt->dst.from &&
1924             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1925                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1926 }
1927
1928 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1929 {
1930         u32 rt_cookie = 0;
1931
1932         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1933                 return NULL;
1934
1935         if (rt6_check_expired(rt))
1936                 return NULL;
1937
1938         return &rt->dst;
1939 }
1940
1941 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1942 {
1943         if (!__rt6_check_expired(rt) &&
1944             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1945             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1946                 return &rt->dst;
1947         else
1948                 return NULL;
1949 }
1950
1951 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1952 {
1953         struct rt6_info *rt;
1954
1955         rt = (struct rt6_info *) dst;
1956
1957         /* All IPV6 dsts are created with ->obsolete set to the value
1958          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1959          * into this function always.
1960          */
1961
1962         rt6_dst_from_metrics_check(rt);
1963
1964         if (rt->rt6i_flags & RTF_PCPU ||
1965             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1966                 return rt6_dst_from_check(rt, cookie);
1967         else
1968                 return rt6_check(rt, cookie);
1969 }
1970
1971 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1972 {
1973         struct rt6_info *rt = (struct rt6_info *) dst;
1974
1975         if (rt) {
1976                 if (rt->rt6i_flags & RTF_CACHE) {
1977                         if (rt6_check_expired(rt)) {
1978                                 ip6_del_rt(rt);
1979                                 dst = NULL;
1980                         }
1981                 } else {
1982                         dst_release(dst);
1983                         dst = NULL;
1984                 }
1985         }
1986         return dst;
1987 }
1988
1989 static void ip6_link_failure(struct sk_buff *skb)
1990 {
1991         struct rt6_info *rt;
1992
1993         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1994
1995         rt = (struct rt6_info *) skb_dst(skb);
1996         if (rt) {
1997                 if (rt->rt6i_flags & RTF_CACHE) {
1998                         if (dst_hold_safe(&rt->dst))
1999                                 ip6_del_rt(rt);
2000                 } else {
2001                         struct fib6_node *fn;
2002
2003                         rcu_read_lock();
2004                         fn = rcu_dereference(rt->rt6i_node);
2005                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2006                                 fn->fn_sernum = -1;
2007                         rcu_read_unlock();
2008                 }
2009         }
2010 }
2011
2012 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2013 {
2014         struct net *net = dev_net(rt->dst.dev);
2015
2016         rt->rt6i_flags |= RTF_MODIFIED;
2017         rt->rt6i_pmtu = mtu;
2018         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2019 }
2020
2021 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2022 {
2023         return !(rt->rt6i_flags & RTF_CACHE) &&
2024                 (rt->rt6i_flags & RTF_PCPU ||
2025                  rcu_access_pointer(rt->rt6i_node));
2026 }
2027
/* Core PMTU update.  Depending on the route type, either update the
 * route in place (entries not eligible for an exception clone) or
 * create an RTF_CACHE exception clone carrying the new MTU.
 * @iph / @sk provide the addresses used for neighbour confirmation and
 * for keying the exception entry; both may be NULL.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
                                 const struct ipv6hdr *iph, u32 mtu)
{
        const struct in6_addr *daddr, *saddr;
        struct rt6_info *rt6 = (struct rt6_info *)dst;

        /* local routes carry no path MTU state */
        if (rt6->rt6i_flags & RTF_LOCAL)
                return;

        /* an administratively locked MTU must not be changed by PMTUD */
        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (iph) {
                daddr = &iph->daddr;
                saddr = &iph->saddr;
        } else if (sk) {
                daddr = &sk->sk_v6_daddr;
                saddr = &inet6_sk(sk)->saddr;
        } else {
                daddr = NULL;
                saddr = NULL;
        }
        /* a PMTU message is evidence of two-way nexthop reachability */
        dst_confirm_neigh(dst, daddr);
        mtu = max_t(u32, mtu, IPV6_MIN_MTU);
        /* only ever shrink the effective MTU */
        if (mtu >= dst_mtu(dst))
                return;

        if (!rt6_cache_allowed_for_pmtu(rt6)) {
                rt6_do_update_pmtu(rt6, mtu);
                /* update rt6_ex->stamp for cache */
                if (rt6->rt6i_flags & RTF_CACHE)
                        rt6_update_exception_stamp_rt(rt6);
        } else if (daddr) {
                struct rt6_info *nrt6;

                /* clone the route and park the new MTU on the clone;
                 * release the clone immediately if insertion fails
                 */
                nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
                if (nrt6) {
                        rt6_do_update_pmtu(nrt6, mtu);
                        if (rt6_insert_exception(nrt6, rt6))
                                dst_release_immediate(&nrt6->dst);
                }
        }
}
2071
2072 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2073                                struct sk_buff *skb, u32 mtu)
2074 {
2075         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2076 }
2077
2078 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2079                      int oif, u32 mark, kuid_t uid)
2080 {
2081         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2082         struct dst_entry *dst;
2083         struct flowi6 fl6;
2084
2085         memset(&fl6, 0, sizeof(fl6));
2086         fl6.flowi6_oif = oif;
2087         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2088         fl6.daddr = iph->daddr;
2089         fl6.saddr = iph->saddr;
2090         fl6.flowlabel = ip6_flowinfo(iph);
2091         fl6.flowi6_uid = uid;
2092
2093         dst = ip6_route_output(net, NULL, &fl6);
2094         if (!dst->error)
2095                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2096         dst_release(dst);
2097 }
2098 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2099
2100 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2101 {
2102         struct dst_entry *dst;
2103
2104         ip6_update_pmtu(skb, sock_net(sk), mtu,
2105                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2106
2107         dst = __sk_dst_get(sk);
2108         if (!dst || !dst->obsolete ||
2109             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2110                 return;
2111
2112         bh_lock_sock(sk);
2113         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2114                 ip6_datagram_dst_update(sk, false);
2115         bh_unlock_sock(sk);
2116 }
2117 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2118
/* Handle redirects: flowi6 extended with the redirecting router's
 * address, so the lookup callback can verify the redirect source.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;              /* flow that triggered the redirect */
        struct in6_addr gateway;        /* router that sent the redirect */
};
2124
2125 static struct rt6_info *__ip6_route_redirect(struct net *net,
2126                                              struct fib6_table *table,
2127                                              struct flowi6 *fl6,
2128                                              int flags)
2129 {
2130         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2131         struct rt6_info *rt, *rt_cache;
2132         struct fib6_node *fn;
2133
2134         /* Get the "current" route for this destination and
2135          * check if the redirect has come from appropriate router.
2136          *
2137          * RFC 4861 specifies that redirects should only be
2138          * accepted if they come from the nexthop to the target.
2139          * Due to the way the routes are chosen, this notion
2140          * is a bit fuzzy and one might need to check all possible
2141          * routes.
2142          */
2143
2144         rcu_read_lock();
2145         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2146 restart:
2147         for_each_fib6_node_rt_rcu(fn) {
2148                 if (rt6_check_expired(rt))
2149                         continue;
2150                 if (rt->dst.error)
2151                         break;
2152                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2153                         continue;
2154                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2155                         continue;
2156                 /* rt_cache's gateway might be different from its 'parent'
2157                  * in the case of an ip redirect.
2158                  * So we keep searching in the exception table if the gateway
2159                  * is different.
2160                  */
2161                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2162                         rt_cache = rt6_find_cached_rt(rt,
2163                                                       &fl6->daddr,
2164                                                       &fl6->saddr);
2165                         if (rt_cache &&
2166                             ipv6_addr_equal(&rdfl->gateway,
2167                                             &rt_cache->rt6i_gateway)) {
2168                                 rt = rt_cache;
2169                                 break;
2170                         }
2171                         continue;
2172                 }
2173                 break;
2174         }
2175
2176         if (!rt)
2177                 rt = net->ipv6.ip6_null_entry;
2178         else if (rt->dst.error) {
2179                 rt = net->ipv6.ip6_null_entry;
2180                 goto out;
2181         }
2182
2183         if (rt == net->ipv6.ip6_null_entry) {
2184                 fn = fib6_backtrack(fn, &fl6->saddr);
2185                 if (fn)
2186                         goto restart;
2187         }
2188
2189 out:
2190         ip6_hold_safe(net, &rt, true);
2191
2192         rcu_read_unlock();
2193
2194         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
2195         return rt;
2196 };
2197
2198 static struct dst_entry *ip6_route_redirect(struct net *net,
2199                                         const struct flowi6 *fl6,
2200                                         const struct in6_addr *gateway)
2201 {
2202         int flags = RT6_LOOKUP_F_HAS_SADDR;
2203         struct ip6rd_flowi rdfl;
2204
2205         rdfl.fl6 = *fl6;
2206         rdfl.gateway = *gateway;
2207
2208         return fib6_rule_lookup(net, &rdfl.fl6,
2209                                 flags, __ip6_route_redirect);
2210 }
2211
2212 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2213                   kuid_t uid)
2214 {
2215         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2216         struct dst_entry *dst;
2217         struct flowi6 fl6;
2218
2219         memset(&fl6, 0, sizeof(fl6));
2220         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2221         fl6.flowi6_oif = oif;
2222         fl6.flowi6_mark = mark;
2223         fl6.daddr = iph->daddr;
2224         fl6.saddr = iph->saddr;
2225         fl6.flowlabel = ip6_flowinfo(iph);
2226         fl6.flowi6_uid = uid;
2227
2228         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2229         rt6_do_redirect(dst, NULL, skb);
2230         dst_release(dst);
2231 }
2232 EXPORT_SYMBOL_GPL(ip6_redirect);
2233
2234 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2235                             u32 mark)
2236 {
2237         const struct ipv6hdr *iph = ipv6_hdr(skb);
2238         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2239         struct dst_entry *dst;
2240         struct flowi6 fl6;
2241
2242         memset(&fl6, 0, sizeof(fl6));
2243         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2244         fl6.flowi6_oif = oif;
2245         fl6.flowi6_mark = mark;
2246         fl6.daddr = msg->dest;
2247         fl6.saddr = iph->daddr;
2248         fl6.flowi6_uid = sock_net_uid(net, NULL);
2249
2250         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2251         rt6_do_redirect(dst, NULL, skb);
2252         dst_release(dst);
2253 }
2254
2255 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2256 {
2257         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2258                      sk->sk_uid);
2259 }
2260 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2261
2262 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2263 {
2264         struct net_device *dev = dst->dev;
2265         unsigned int mtu = dst_mtu(dst);
2266         struct net *net = dev_net(dev);
2267
2268         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2269
2270         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2271                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2272
2273         /*
2274          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2275          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2276          * IPV6_MAXPLEN is also valid and means: "any MSS,
2277          * rely only on pmtu discovery"
2278          */
2279         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2280                 mtu = IPV6_MAXPLEN;
2281         return mtu;
2282 }
2283
2284 static unsigned int ip6_mtu(const struct dst_entry *dst)
2285 {
2286         const struct rt6_info *rt = (const struct rt6_info *)dst;
2287         unsigned int mtu = rt->rt6i_pmtu;
2288         struct inet6_dev *idev;
2289
2290         if (mtu)
2291                 goto out;
2292
2293         mtu = dst_metric_raw(dst, RTAX_MTU);
2294         if (mtu)
2295                 goto out;
2296
2297         mtu = IPV6_MIN_MTU;
2298
2299         rcu_read_lock();
2300         idev = __in6_dev_get(dst->dev);
2301         if (idev)
2302                 mtu = idev->cnf.mtu6;
2303         rcu_read_unlock();
2304
2305 out:
2306         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2307
2308         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2309 }
2310
/* Allocate a one-off host route used for sending ICMPv6 messages.
 * The route is not inserted in the fib tree; it lives on the uncached
 * list so device teardown can release it.  Returns an ERR_PTR on
 * failure; on success the result has passed through xfrm_lookup().
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                                  struct flowi6 *fl6)
{
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct inet6_dev *idev = in6_dev_get(dev);
        struct net *net = dev_net(dev);

        if (unlikely(!idev))
                return ERR_PTR(-ENODEV);

        rt = ip6_dst_alloc(net, dev, 0);
        if (unlikely(!rt)) {
                /* drop the idev reference taken above */
                in6_dev_put(idev);
                dst = ERR_PTR(-ENOMEM);
                goto out;
        }

        /* host route straight to the destination; ownership of the idev
         * reference moves to the rt6_info here
         */
        rt->dst.flags |= DST_HOST;
        rt->dst.output  = ip6_output;
        rt->rt6i_gateway  = fl6->daddr;
        rt->rt6i_dst.addr = fl6->daddr;
        rt->rt6i_dst.plen = 128;
        rt->rt6i_idev     = idev;
        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

        /* Add this dst into uncached_list so that rt6_ifdown() can
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);
        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
        return dst;
}
2348
/* dst_ops->gc: run the fib6 garbage collector when the dst pool is over
 * budget.  ip6_rt_gc_expire is an adaptive aggressiveness value: it is
 * bumped each time gc is needed and decays by 1/2^elasticity per call.
 * Returns nonzero when new dst allocations should fail (pool still full).
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
        int entries;

        entries = dst_entries_get_fast(ops);
        /* rate-limit gc unless the pool has exceeded its maximum size */
        if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
            entries <= rt_max_size)
                goto out;

        net->ipv6.ip6_rt_gc_expire++;
        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
        entries = dst_entries_get_slow(ops);
        /* pressure relieved: reset aggressiveness to a moderate value */
        if (entries < ops->gc_thresh)
                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
        /* exponential decay of the gc aggressiveness */
        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
        return entries > rt_max_size;
}
2373
2374 static int ip6_convert_metrics(struct mx6_config *mxc,
2375                                const struct fib6_config *cfg)
2376 {
2377         bool ecn_ca = false;
2378         struct nlattr *nla;
2379         int remaining;
2380         u32 *mp;
2381
2382         if (!cfg->fc_mx)
2383                 return 0;
2384
2385         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2386         if (unlikely(!mp))
2387                 return -ENOMEM;
2388
2389         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2390                 int type = nla_type(nla);
2391                 u32 val;
2392
2393                 if (!type)
2394                         continue;
2395                 if (unlikely(type > RTAX_MAX))
2396                         goto err;
2397
2398                 if (type == RTAX_CC_ALGO) {
2399                         char tmp[TCP_CA_NAME_MAX];
2400
2401                         nla_strlcpy(tmp, nla, sizeof(tmp));
2402                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2403                         if (val == TCP_CA_UNSPEC)
2404                                 goto err;
2405                 } else {
2406                         val = nla_get_u32(nla);
2407                 }
2408                 if (type == RTAX_HOPLIMIT && val > 255)
2409                         val = 255;
2410                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2411                         goto err;
2412
2413                 mp[type - 1] = val;
2414                 __set_bit(type - 1, mxc->mx_valid);
2415         }
2416
2417         if (ecn_ca) {
2418                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2419                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2420         }
2421
2422         mxc->mx = mp;
2423         return 0;
2424  err:
2425         kfree(mp);
2426         return -EINVAL;
2427 }
2428
2429 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2430                                             struct fib6_config *cfg,
2431                                             const struct in6_addr *gw_addr)
2432 {
2433         struct flowi6 fl6 = {
2434                 .flowi6_oif = cfg->fc_ifindex,
2435                 .daddr = *gw_addr,
2436                 .saddr = cfg->fc_prefsrc,
2437         };
2438         struct fib6_table *table;
2439         struct rt6_info *rt;
2440         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2441
2442         table = fib6_get_table(net, cfg->fc_table);
2443         if (!table)
2444                 return NULL;
2445
2446         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2447                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2448
2449         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2450
2451         /* if table lookup failed, fall back to full lookup */
2452         if (rt == net->ipv6.ip6_null_entry) {
2453                 ip6_rt_put(rt);
2454                 rt = NULL;
2455         }
2456
2457         return rt;
2458 }
2459
2460 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2461                                               struct netlink_ext_ack *extack)
2462 {
2463         struct net *net = cfg->fc_nlinfo.nl_net;
2464         struct rt6_info *rt = NULL;
2465         struct net_device *dev = NULL;
2466         struct inet6_dev *idev = NULL;
2467         struct fib6_table *table;
2468         int addr_type;
2469         int err = -EINVAL;
2470
2471         /* RTF_PCPU is an internal flag; can not be set by userspace */
2472         if (cfg->fc_flags & RTF_PCPU) {
2473                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2474                 goto out;
2475         }
2476
2477         if (cfg->fc_dst_len > 128) {
2478                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2479                 goto out;
2480         }
2481         if (cfg->fc_src_len > 128) {
2482                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2483                 goto out;
2484         }
2485 #ifndef CONFIG_IPV6_SUBTREES
2486         if (cfg->fc_src_len) {
2487                 NL_SET_ERR_MSG(extack,
2488                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2489                 goto out;
2490         }
2491 #endif
2492         if (cfg->fc_ifindex) {
2493                 err = -ENODEV;
2494                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2495                 if (!dev)
2496                         goto out;
2497                 idev = in6_dev_get(dev);
2498                 if (!idev)
2499                         goto out;
2500         }
2501
2502         if (cfg->fc_metric == 0)
2503                 cfg->fc_metric = IP6_RT_PRIO_USER;
2504
2505         err = -ENOBUFS;
2506         if (cfg->fc_nlinfo.nlh &&
2507             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2508                 table = fib6_get_table(net, cfg->fc_table);
2509                 if (!table) {
2510                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2511                         table = fib6_new_table(net, cfg->fc_table);
2512                 }
2513         } else {
2514                 table = fib6_new_table(net, cfg->fc_table);
2515         }
2516
2517         if (!table)
2518                 goto out;
2519
2520         rt = ip6_dst_alloc(net, NULL,
2521                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2522
2523         if (!rt) {
2524                 err = -ENOMEM;
2525                 goto out;
2526         }
2527
2528         if (cfg->fc_flags & RTF_EXPIRES)
2529                 rt6_set_expires(rt, jiffies +
2530                                 clock_t_to_jiffies(cfg->fc_expires));
2531         else
2532                 rt6_clean_expires(rt);
2533
2534         if (cfg->fc_protocol == RTPROT_UNSPEC)
2535                 cfg->fc_protocol = RTPROT_BOOT;
2536         rt->rt6i_protocol = cfg->fc_protocol;
2537
2538         addr_type = ipv6_addr_type(&cfg->fc_dst);
2539
2540         if (addr_type & IPV6_ADDR_MULTICAST)
2541                 rt->dst.input = ip6_mc_input;
2542         else if (cfg->fc_flags & RTF_LOCAL)
2543                 rt->dst.input = ip6_input;
2544         else
2545                 rt->dst.input = ip6_forward;
2546
2547         rt->dst.output = ip6_output;
2548
2549         if (cfg->fc_encap) {
2550                 struct lwtunnel_state *lwtstate;
2551
2552                 err = lwtunnel_build_state(cfg->fc_encap_type,
2553                                            cfg->fc_encap, AF_INET6, cfg,
2554                                            &lwtstate, extack);
2555                 if (err)
2556                         goto out;
2557                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2558                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2559                         rt->dst.lwtstate->orig_output = rt->dst.output;
2560                         rt->dst.output = lwtunnel_output;
2561                 }
2562                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2563                         rt->dst.lwtstate->orig_input = rt->dst.input;
2564                         rt->dst.input = lwtunnel_input;
2565                 }
2566         }
2567
2568         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2569         rt->rt6i_dst.plen = cfg->fc_dst_len;
2570         if (rt->rt6i_dst.plen == 128)
2571                 rt->dst.flags |= DST_HOST;
2572
2573 #ifdef CONFIG_IPV6_SUBTREES
2574         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2575         rt->rt6i_src.plen = cfg->fc_src_len;
2576 #endif
2577
2578         rt->rt6i_metric = cfg->fc_metric;
2579
2580         /* We cannot add true routes via loopback here,
2581            they would result in kernel looping; promote them to reject routes
2582          */
2583         if ((cfg->fc_flags & RTF_REJECT) ||
2584             (dev && (dev->flags & IFF_LOOPBACK) &&
2585              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2586              !(cfg->fc_flags & RTF_LOCAL))) {
2587                 /* hold loopback dev/idev if we haven't done so. */
2588                 if (dev != net->loopback_dev) {
2589                         if (dev) {
2590                                 dev_put(dev);
2591                                 in6_dev_put(idev);
2592                         }
2593                         dev = net->loopback_dev;
2594                         dev_hold(dev);
2595                         idev = in6_dev_get(dev);
2596                         if (!idev) {
2597                                 err = -ENODEV;
2598                                 goto out;
2599                         }
2600                 }
2601                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2602                 switch (cfg->fc_type) {
2603                 case RTN_BLACKHOLE:
2604                         rt->dst.error = -EINVAL;
2605                         rt->dst.output = dst_discard_out;
2606                         rt->dst.input = dst_discard;
2607                         break;
2608                 case RTN_PROHIBIT:
2609                         rt->dst.error = -EACCES;
2610                         rt->dst.output = ip6_pkt_prohibit_out;
2611                         rt->dst.input = ip6_pkt_prohibit;
2612                         break;
2613                 case RTN_THROW:
2614                 case RTN_UNREACHABLE:
2615                 default:
2616                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2617                                         : (cfg->fc_type == RTN_UNREACHABLE)
2618                                         ? -EHOSTUNREACH : -ENETUNREACH;
2619                         rt->dst.output = ip6_pkt_discard_out;
2620                         rt->dst.input = ip6_pkt_discard;
2621                         break;
2622                 }
2623                 goto install_route;
2624         }
2625
2626         if (cfg->fc_flags & RTF_GATEWAY) {
2627                 const struct in6_addr *gw_addr;
2628                 int gwa_type;
2629
2630                 gw_addr = &cfg->fc_gateway;
2631                 gwa_type = ipv6_addr_type(gw_addr);
2632
2633                 /* if gw_addr is local we will fail to detect this in case
2634                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2635                  * will return already-added prefix route via interface that
2636                  * prefix route was assigned to, which might be non-loopback.
2637                  */
2638                 err = -EINVAL;
2639                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2640                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2641                                             dev : NULL, 0, 0)) {
2642                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2643                         goto out;
2644                 }
2645                 rt->rt6i_gateway = *gw_addr;
2646
2647                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2648                         struct rt6_info *grt = NULL;
2649
2650                         /* IPv6 strictly inhibits using not link-local
2651                            addresses as nexthop address.
2652                            Otherwise, router will not able to send redirects.
2653                            It is very good, but in some (rare!) circumstances
2654                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2655                            some exceptions. --ANK
2656                            We allow IPv4-mapped nexthops to support RFC4798-type
2657                            addressing
2658                          */
2659                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2660                                           IPV6_ADDR_MAPPED))) {
2661                                 NL_SET_ERR_MSG(extack,
2662                                                "Invalid gateway address");
2663                                 goto out;
2664                         }
2665
2666                         if (cfg->fc_table) {
2667                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2668
2669                                 if (grt) {
2670                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2671                                             (dev && dev != grt->dst.dev)) {
2672                                                 ip6_rt_put(grt);
2673                                                 grt = NULL;
2674                                         }
2675                                 }
2676                         }
2677
2678                         if (!grt)
2679                                 grt = rt6_lookup(net, gw_addr, NULL,
2680                                                  cfg->fc_ifindex, 1);
2681
2682                         err = -EHOSTUNREACH;
2683                         if (!grt)
2684                                 goto out;
2685                         if (dev) {
2686                                 if (dev != grt->dst.dev) {
2687                                         ip6_rt_put(grt);
2688                                         goto out;
2689                                 }
2690                         } else {
2691                                 dev = grt->dst.dev;
2692                                 idev = grt->rt6i_idev;
2693                                 dev_hold(dev);
2694                                 in6_dev_hold(grt->rt6i_idev);
2695                         }
2696                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2697                                 err = 0;
2698                         ip6_rt_put(grt);
2699
2700                         if (err)
2701                                 goto out;
2702                 }
2703                 err = -EINVAL;
2704                 if (!dev) {
2705                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2706                         goto out;
2707                 } else if (dev->flags & IFF_LOOPBACK) {
2708                         NL_SET_ERR_MSG(extack,
2709                                        "Egress device can not be loopback device for this route");
2710                         goto out;
2711                 }
2712         }
2713
2714         err = -ENODEV;
2715         if (!dev)
2716                 goto out;
2717
2718         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2719                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2720                         NL_SET_ERR_MSG(extack, "Invalid source address");
2721                         err = -EINVAL;
2722                         goto out;
2723                 }
2724                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2725                 rt->rt6i_prefsrc.plen = 128;
2726         } else
2727                 rt->rt6i_prefsrc.plen = 0;
2728
2729         rt->rt6i_flags = cfg->fc_flags;
2730
2731 install_route:
2732         rt->dst.dev = dev;
2733         rt->rt6i_idev = idev;
2734         rt->rt6i_table = table;
2735
2736         cfg->fc_nlinfo.nl_net = dev_net(dev);
2737
2738         return rt;
2739 out:
2740         if (dev)
2741                 dev_put(dev);
2742         if (idev)
2743                 in6_dev_put(idev);
2744         if (rt)
2745                 dst_release_immediate(&rt->dst);
2746
2747         return ERR_PTR(err);
2748 }
2749
2750 int ip6_route_add(struct fib6_config *cfg,
2751                   struct netlink_ext_ack *extack)
2752 {
2753         struct mx6_config mxc = { .mx = NULL, };
2754         struct rt6_info *rt;
2755         int err;
2756
2757         rt = ip6_route_info_create(cfg, extack);
2758         if (IS_ERR(rt)) {
2759                 err = PTR_ERR(rt);
2760                 rt = NULL;
2761                 goto out;
2762         }
2763
2764         err = ip6_convert_metrics(&mxc, cfg);
2765         if (err)
2766                 goto out;
2767
2768         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2769
2770         kfree(mxc.mx);
2771
2772         return err;
2773 out:
2774         if (rt)
2775                 dst_release_immediate(&rt->dst);
2776
2777         return err;
2778 }
2779
2780 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2781 {
2782         int err;
2783         struct fib6_table *table;
2784         struct net *net = dev_net(rt->dst.dev);
2785
2786         if (rt == net->ipv6.ip6_null_entry) {
2787                 err = -ENOENT;
2788                 goto out;
2789         }
2790
2791         table = rt->rt6i_table;
2792         spin_lock_bh(&table->tb6_lock);
2793         err = fib6_del(rt, info);
2794         spin_unlock_bh(&table->tb6_lock);
2795
2796 out:
2797         ip6_rt_put(rt);
2798         return err;
2799 }
2800
2801 int ip6_del_rt(struct rt6_info *rt)
2802 {
2803         struct nl_info info = {
2804                 .nl_net = dev_net(rt->dst.dev),
2805         };
2806         return __ip6_del_rt(rt, &info);
2807 }
2808
/* Delete @rt and, when cfg->fc_delete_all_nh is set, all of its ECMP
 * siblings in one pass under a single table lock.  Consumes the
 * caller's reference on @rt.
 *
 * When all hops go, one RTM_DELROUTE message describing the whole
 * multipath route is built up front and sent once at the end, with
 * per-route notifications suppressed via info->skip_notify.  If that
 * message cannot be built, fib6_del() notifies per hop as usual.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	/* the shared null entry is never deleted */
	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe iteration: fib6_del() unlinks each sibling */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2860
/* Delete the route(s) described by @cfg from table cfg->fc_table.
 *
 * The matching fib6 node is located under RCU; each of its routes is
 * checked against the optional ifindex / gateway / metric / protocol
 * filters.  With RTF_CACHE set, only the matching cached exception
 * route is removed.  If a gateway was given only that one hop is
 * deleted, otherwise the route goes together with its ECMP siblings.
 * Returns -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* NOTE(review): the last argument appears to request an exact
	 * prefix match only when not deleting a cached route — confirm
	 * against fib6_locate().
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				/* operate on the cached exception, if any */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* bail out if the route is already being freed */
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
2917
/* Process a received ICMPv6 Redirect for the path cached in @dst.
 *
 * The message is validated per RFC 2461: minimum length, non-multicast
 * destination, link-local unicast target (unless the redirect is
 * on-link, i.e. dest == target), redirects accepted and forwarding
 * disabled on the receiving interface.  On success the neighbour cache
 * is updated for the new first hop and a RTF_CACHE exception route
 * towards msg->dest is installed.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* length of the ND options that follow the fixed rd_msg header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link and
	 * becomes the next hop; otherwise target must be a link-local
	 * unicast router address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers must not honour redirects; hosts may opt out */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	/* cache the new path; on-link targets carry no gateway */
	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3035
3036 /*
3037  *      Misc support functions
3038  */
3039
/* Record @from as the origin route of @rt: @rt pins @from's dst via
 * rt->dst.from and shares @from's metrics.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	/* the origin must not itself have an origin (no ->from chains) */
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	/* final 'true' marks the shared metrics read-only for rt */
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3049
/* Initialize @rt as a copy of @ort: duplicates the routing state,
 * takes references on the shared inet6_dev and lwtunnel state, and
 * links @rt back to @ort as its origin via rt6_set_from().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3071
3072 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a route previously installed from an RA Route Information
 * option: same prefix, gateway and interface, with both RTF_ROUTEINFO
 * and RTF_GATEWAY set.  Returns the route with a reference held, or
 * NULL if no such route exists.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* matched: take a reference before leaving the RCU section */
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3107
3108 static struct rt6_info *rt6_add_route_info(struct net *net,
3109                                            const struct in6_addr *prefix, int prefixlen,
3110                                            const struct in6_addr *gwaddr,
3111                                            struct net_device *dev,
3112                                            unsigned int pref)
3113 {
3114         struct fib6_config cfg = {
3115                 .fc_metric      = IP6_RT_PRIO_USER,
3116                 .fc_ifindex     = dev->ifindex,
3117                 .fc_dst_len     = prefixlen,
3118                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3119                                   RTF_UP | RTF_PREF(pref),
3120                 .fc_protocol = RTPROT_RA,
3121                 .fc_nlinfo.portid = 0,
3122                 .fc_nlinfo.nlh = NULL,
3123                 .fc_nlinfo.nl_net = net,
3124         };
3125
3126         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3127         cfg.fc_dst = *prefix;
3128         cfg.fc_gateway = *gwaddr;
3129
3130         /* We should treat it as a default route if prefix length is 0. */
3131         if (!prefixlen)
3132                 cfg.fc_flags |= RTF_DEFAULT;
3133
3134         ip6_route_add(&cfg, NULL);
3135
3136         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3137 }
3138 #endif
3139
/* Find the default-router entry addrconf installed for gateway @addr
 * on @dev (flags RTF_ADDRCONF | RTF_DEFAULT both set).  Returns the
 * route with a reference held, or NULL.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	/* take a reference only when a matching entry was found */
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3162
3163 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3164                                      struct net_device *dev,
3165                                      unsigned int pref)
3166 {
3167         struct fib6_config cfg = {
3168                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3169                 .fc_metric      = IP6_RT_PRIO_USER,
3170                 .fc_ifindex     = dev->ifindex,
3171                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3172                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3173                 .fc_protocol = RTPROT_RA,
3174                 .fc_nlinfo.portid = 0,
3175                 .fc_nlinfo.nlh = NULL,
3176                 .fc_nlinfo.nl_net = dev_net(dev),
3177         };
3178
3179         cfg.fc_gateway = *gwaddr;
3180
3181         if (!ip6_route_add(&cfg, NULL)) {
3182                 struct fib6_table *table;
3183
3184                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3185                 if (table)
3186                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3187         }
3188
3189         return rt6_get_dflt_router(gwaddr, dev);
3190 }
3191
/* Remove every addrconf-learned default route from @table, except on
 * interfaces with accept_ra == 2 (keep accepting RAs).  The RCU lock
 * must be dropped before ip6_del_rt(), so the walk restarts from the
 * top after each deletion attempt.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* only delete entries we managed to take a ref on */
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(rt);
			} else {
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3214
3215 void rt6_purge_dflt_routers(struct net *net)
3216 {
3217         struct fib6_table *table;
3218         struct hlist_head *head;
3219         unsigned int h;
3220
3221         rcu_read_lock();
3222
3223         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3224                 head = &net->ipv6.fib_table_hash[h];
3225                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3226                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3227                                 __rt6_purge_dflt_routers(table);
3228                 }
3229         }
3230
3231         rcu_read_unlock();
3232 }
3233
3234 static void rtmsg_to_fib6_config(struct net *net,
3235                                  struct in6_rtmsg *rtmsg,
3236                                  struct fib6_config *cfg)
3237 {
3238         memset(cfg, 0, sizeof(*cfg));
3239
3240         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3241                          : RT6_TABLE_MAIN;
3242         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3243         cfg->fc_metric = rtmsg->rtmsg_metric;
3244         cfg->fc_expires = rtmsg->rtmsg_info;
3245         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3246         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3247         cfg->fc_flags = rtmsg->rtmsg_flags;
3248
3249         cfg->fc_nlinfo.nl_net = net;
3250
3251         cfg->fc_dst = rtmsg->rtmsg_dst;
3252         cfg->fc_src = rtmsg->rtmsg_src;
3253         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3254 }
3255
3256 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3257 {
3258         struct fib6_config cfg;
3259         struct in6_rtmsg rtmsg;
3260         int err;
3261
3262         switch (cmd) {
3263         case SIOCADDRT:         /* Add a route */
3264         case SIOCDELRT:         /* Delete a route */
3265                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3266                         return -EPERM;
3267                 err = copy_from_user(&rtmsg, arg,
3268                                      sizeof(struct in6_rtmsg));
3269                 if (err)
3270                         return -EFAULT;
3271
3272                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3273
3274                 rtnl_lock();
3275                 switch (cmd) {
3276                 case SIOCADDRT:
3277                         err = ip6_route_add(&cfg, NULL);
3278                         break;
3279                 case SIOCDELRT:
3280                         err = ip6_route_del(&cfg, NULL);
3281                         break;
3282                 default:
3283                         err = -EINVAL;
3284                 }
3285                 rtnl_unlock();
3286
3287                 return err;
3288         }
3289
3290         return -EINVAL;
3291 }
3292
3293 /*
3294  *      Drop the packet on the floor
3295  */
3296
3297 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3298 {
3299         int type;
3300         struct dst_entry *dst = skb_dst(skb);
3301         switch (ipstats_mib_noroutes) {
3302         case IPSTATS_MIB_INNOROUTES:
3303                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3304                 if (type == IPV6_ADDR_ANY) {
3305                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3306                                       IPSTATS_MIB_INADDRERRORS);
3307                         break;
3308                 }
3309                 /* FALLTHROUGH */
3310         case IPSTATS_MIB_OUTNOROUTES:
3311                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3312                               ipstats_mib_noroutes);
3313                 break;
3314         }
3315         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3316         kfree_skb(skb);
3317         return 0;
3318 }
3319
/* dst input handler for reject routes (RTN_THROW/RTN_UNREACHABLE). */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3324
/* dst output handler for reject routes; repoint skb->dev at the dst's
 * device before accounting the drop on the output side.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3330
/* dst input handler for RTN_PROHIBIT routes. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3335
/* dst output handler for RTN_PROHIBIT routes. */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3341
3342 /*
3343  *      Allocate a dst for local (unicast / anycast) address.
3344  */
3345
/*
 * Allocate the host route (/128) used internally for a local unicast or
 * anycast address on @idev.  Returns the new rt6_info or an ERR_PTR on
 * allocation failure.  The route is created RTF_UP|RTF_NONEXTHOP plus
 * RTF_ANYCAST or RTF_LOCAL depending on @anycast.
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	/* DST_NOCOUNT: do not account this dst against cache limits */
	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	/* hold idev for rt->rt6i_idev below; NOTE(review): the matching
	 * in6_dev_put presumably happens when the dst is destroyed — confirm
	 */
	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	/* host route for the address itself; gateway mirrors the address */
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	/* l3mdev (VRF) slaves go in their master's table, else the local table */
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
3381
3382 /* remove deleted ip from prefsrc entries */
/* walk argument for fib6_remove_prefsrc() */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device being cleaned; NULL matches any */
	struct net *net;	/* namespace the walk runs in */
	struct in6_addr *addr;	/* preferred-source address going away */
};
3388
/* fib6_clean_all() callback: clear the preferred-source address of any
 * route still pointing at arg->addr (restricted to arg->dev when set).
 * Always returns 0 so the walk continues over the whole table.
 */
static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->dst.dev == dev || !dev) &&
	    rt != net->ipv6.ip6_null_entry &&
	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
		/* exception-table entries share the lock with the update */
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->rt6i_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3407
/* Drop @ifp's address as a preferred source from every route in its
 * namespace (called when the address is removed from the device).
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3418
/* flag combination identifying a default route learned from an RA */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* NOTE(review): returning -1 asks the fib walker to remove this
	 * route — confirm against fib6_clean_all()'s callback contract
	 */
	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3439
/* Purge RA-learned router routes (and cached exceptions) via @gateway
 * after the gateway stopped acting as a router.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3444
/* walk argument for fib6_ifdown() */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches any */
	struct net *net;	/* namespace the walk runs in */
};
3449
/* called with write lock held for table with rt */
/* fib6_clean_all() callback: return -1 to delete routes that reference
 * the device being taken down.  Multipath siblings survive a link-down
 * (unless the device is unregistering or the sysctl says otherwise) so
 * traffic can fail over to the remaining nexthops.
 */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;

	if ((rt->dst.dev == dev || !dev) &&
	    rt != adn->net->ipv6.ip6_null_entry &&
	    (rt->rt6i_nsiblings == 0 ||
	     (dev && netdev_unregistering(dev)) ||
	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
		return -1;

	return 0;
}
3465
/* Remove routes referencing @dev (every device when @dev is NULL) and
 * flush the uncached-route list entries belonging to it.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	if (dev)
		rt6_uncached_list_flush_dev(net, dev);
}
3477
/* walk argument for rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3482
/* fib6_clean_all() callback for a device MTU change: update the route's
 * RTAX_MTU metric and its exception-table (cached PMTU) entries.
 * Always returns 0 so the walk continues.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		/* lock covers both the metric update and the exception walk */
		spin_lock_bh(&rt6_exception_lock);
		if (dst_mtu(&rt->dst) >= arg->mtu ||
		    (dst_mtu(&rt->dst) < arg->mtu &&
		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
		rt6_exceptions_update_pmtu(rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3526
/* Propagate a device MTU change to every affected IPv6 route */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
3536
/* netlink attribute policy for AF_INET6 RTM_{NEW,DEL,GET}ROUTE requests */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
3551
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * struct fib6_config.  Returns 0 on success or a negative errno; on
 * failure *cfg may be only partially filled and must not be used.
 * Note: cfg->fc_mx/fc_mp point into the request skb, not copies.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* reject-type routes carry RTF_REJECT so lookups fail fast */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* only prefix-length bytes of the address are required */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE, when present, overrides rtm_table */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		/* unknown router-preference values fall back to medium */
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* infinite timeouts simply leave RTF_EXPIRES unset */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3675
/* one pending nexthop while building/installing a multipath route */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route to insert; NULL once consumed */
	struct fib6_config r_cfg;	/* per-nexthop copy of the config */
	struct mx6_config mxc;		/* converted metrics (mxc.mx is allocated) */
	struct list_head next;		/* link in the rt6_nh_list */
};
3682
/* Log every nexthop of a failed multipath replace so the admin can
 * reconcile the now possibly inconsistent installed state.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}
3693
/* Queue @rt on @rt6_nh_list for later insertion.  Returns -EEXIST when
 * an equivalent nexthop is already queued, -ENOMEM or a metrics
 * conversion error otherwise.  On any failure the caller keeps
 * ownership of @rt and must release it.
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
3720
/* Send the single RTM_NEWROUTE notification for a multipath add/replace.
 * @rt is the first route inserted, @rt_last the last one (may be NULL
 * when nothing was inserted, in which case no notification is sent).
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
3741
/* Install a multipath (RTA_MULTIPATH) route: build one rt6_info per
 * nexthop, insert them all, then send a single notification covering
 * the whole route.  If any insertion fails, every nexthop inserted so
 * far is deleted again, making the operation all-or-nothing.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* append failed: we still own rt, release it */
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		/* rt6_info is non-NULL only for never-inserted entries */
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3872
/* Delete each nexthop of an RTA_MULTIPATH request as an individual
 * route.  Failures do not stop the loop; the last error seen (or 0) is
 * returned.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
3910
3911 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3912                               struct netlink_ext_ack *extack)
3913 {
3914         struct fib6_config cfg;
3915         int err;
3916
3917         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3918         if (err < 0)
3919                 return err;
3920
3921         if (cfg.fc_mp)
3922                 return ip6_route_multipath_del(&cfg, extack);
3923         else {
3924                 cfg.fc_delete_all_nh = 1;
3925                 return ip6_route_del(&cfg, extack);
3926         }
3927 }
3928
3929 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3930                               struct netlink_ext_ack *extack)
3931 {
3932         struct fib6_config cfg;
3933         int err;
3934
3935         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3936         if (err < 0)
3937                 return err;
3938
3939         if (cfg.fc_mp)
3940                 return ip6_route_multipath_add(&cfg, extack);
3941         else
3942                 return ip6_route_add(&cfg, extack);
3943 }
3944
/* Worst-case netlink message size needed to dump @rt, including an
 * RTA_MULTIPATH nest sized for all of its siblings.  Used to size the
 * skb before rt6_fill_node().
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		/* per-nexthop cost inside the RTA_MULTIPATH nest */
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}
3974
/* Emit the nexthop-related attributes (gateway, oif, lwtunnel encap)
 * for @rt and accumulate RTNH_F_* state into *flags.  With @skip_oif
 * the RTA_OIF attribute is omitted (the multipath encoding carries the
 * ifindex in its rtnexthop header instead).  Returns 0 or -EMSGSIZE.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		/* link-down nexthop also counts as dead when the device
		 * is configured to ignore such routes
		 */
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4006
/* add multipath next hop */
/* Append one rtnexthop entry (header + attributes) for @rt inside an
 * open RTA_MULTIPATH nest.  Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	/* skip_oif=true: the ifindex already sits in the rtnexthop header */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4033
/* Build one RTM message of @type describing @rt into @skb.
 * @dst/@src, when non-NULL, are the concrete addresses from a route
 * lookup and force /128 prefix lengths in the output.  Returns 0 on
 * success or -EMSGSIZE; on failure the partial message is cancelled.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	if (rt->rt6i_flags & RTF_REJECT) {
		/* map the reject route's dst.error back to the RTN_* type */
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			/* NOTE(review): err == 0 appears to mean ip6mr
			 * completes the reply itself — confirm against
			 * ip6mr_get_route()
			 */
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* a cached PMTU overrides the route's RTAX_MTU metric */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4187
4188 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4189 {
4190         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4191         struct net *net = arg->net;
4192
4193         if (rt == net->ipv6.ip6_null_entry)
4194                 return 0;
4195
4196         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4197                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4198
4199                 /* user wants prefix routes only */
4200                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4201                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4202                         /* success since this is not a prefix route */
4203                         return 1;
4204                 }
4205         }
4206
4207         return rt6_fill_node(net,
4208                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4209                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4210                      NLM_F_MULTI);
4211 }
4212
/* RTM_GETROUTE doit handler: resolve a single IPv6 route for the netlink
 * request in @nlh and unicast the answer back to the requester.
 *
 * With RTM_F_FIB_MATCH set, the matching FIB entry itself is returned
 * (via ip6_route_lookup()); otherwise a full input (RTA_IIF given) or
 * output route lookup is performed.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* Validate attribute lengths before copying the addresses. */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	/* Without an explicit RTA_UID, input lookups use no UID while
	 * output lookups use the requesting task's UID.
	 */
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* Simulate reception on @iif: resolve the device under RCU
		 * and do an input-path lookup.
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
		else
			dst = ip6_route_lookup(net, &fl6, 0);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
		else
			dst = ip6_route_lookup(net, &fl6, 0);
	}


	/* Lookup failures are carried inside the returned dst; the null
	 * entry covers the fibmatch path, which can return it without an
	 * error dst.  Drop our reference on every error path.
	 */
	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* The skb now owns the route reference. */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4338
/* Broadcast a route change (@event, e.g. RTM_NEWROUTE/RTM_DELROUTE) for
 * @rt to RTNLGRP_IPV6_ROUTE listeners.  On failure the error is reported
 * to the group via rtnl_set_sk_err() rather than returned.
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	/* gfp_any() picks an allocation mode safe for the current
	 * (process vs. softirq) context.
	 */
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
				event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
4369
/* Netdevice notifier: anchor the special (null/prohibit/blackhole) route
 * entries on the namespace's loopback device when it registers, and drop
 * their inet6_dev references when it unregisters.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* Only the loopback device backs the special entries. */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		/* Each in6_dev_get() takes a reference that the
		 * NETDEV_UNREGISTER branch below releases.
		 */
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4402
4403 /*
4404  *      /proc
4405  */
4406
4407 #ifdef CONFIG_PROC_FS
4408
/* /proc/net/ipv6_route: seq_file dump of the routing table.
 * ipv6_route_open is declared earlier in this file (outside this view).
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4416
/* /proc/net/rt6_stats: one line of hex counters, in order: fib nodes,
 * route nodes, allocated routes, route entries, cached routes, current
 * dst entry count, discarded routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
4431
/* Open handler for /proc/net/rt6_stats (single-record, netns-aware). */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4436
/* File operations for /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
4444 #endif  /* CONFIG_PROC_FS */
4445
4446 #ifdef CONFIG_SYSCTL
4447
4448 static
4449 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4450                               void __user *buffer, size_t *lenp, loff_t *ppos)
4451 {
4452         struct net *net;
4453         int delay;
4454         if (!write)
4455                 return -EINVAL;
4456
4457         net = (struct net *)ctl->extra1;
4458         delay = net->ipv6.sysctl.flush_delay;
4459         proc_dointvec(ctl, write, buffer, lenp, ppos);
4460         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4461         return 0;
4462 }
4463
/* Template for the per-netns net.ipv6.route sysctl table.  The .data
 * pointers reference init_net here; ipv6_route_sysctl_init() duplicates
 * the template and rewrites .data *by index*, so the entry order below
 * must stay in sync with the table[N].data assignments there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Same backing field as gc_min_interval, in milliseconds. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4537
/* Duplicate the sysctl template for a new namespace and point each
 * entry's .data at that namespace's own fields.  Returns the new table
 * (caller registers it and eventually frees it) or NULL on allocation
 * failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		/* Indices must match ipv6_route_table_template's order. */
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
4566 #endif
4567
/* Per-netns constructor for IPv6 routing: copies the dst ops template,
 * creates the special null (and, with multiple tables, prohibit and
 * blackhole) route entries and seeds the sysctl defaults.  A goto
 * ladder unwinds partial initialization on failure.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Each special entry is a template copy whose dst.path points
	 * back at itself and whose dst.ops use this namespace's ops.
	 */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default GC/sysctl tuning for the new namespace. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4640
/* Per-netns destructor: inverse of ip6_route_net_init(), releasing the
 * special route entries and the dst entry counters.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
4650
4651 static int __net_init ip6_route_net_init_late(struct net *net)
4652 {
4653 #ifdef CONFIG_PROC_FS
4654         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4655         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4656 #endif
4657         return 0;
4658 }
4659
/* Remove the /proc/net files created by ip6_route_net_init_late(). */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4667
/* Main per-netns lifecycle hooks for the routing subsystem. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4672
4673 static int __net_init ipv6_inetpeer_init(struct net *net)
4674 {
4675         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4676
4677         if (!bp)
4678                 return -ENOMEM;
4679         inet_peer_base_init(bp);
4680         net->ipv6.peers = bp;
4681         return 0;
4682 }
4683
4684 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4685 {
4686         struct inet_peer_base *bp = net->ipv6.peers;
4687
4688         net->ipv6.peers = NULL;
4689         inetpeer_invalidate_tree(bp);
4690         kfree(bp);
4691 }
4692
/* Per-netns lifecycle hooks for the inetpeer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
4697
/* Late per-netns hooks (registered after the main route subsystem). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
4702
/* Netdevice notifier; priority is below ADDRCONF_NOTIFY_PRIORITY so
 * addrconf processes device events before this handler runs.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4707
/* Boot-time fixup: attach init_net's special route entries to its
 * loopback device, mirroring what ip6_route_dev_notify() does for
 * namespaces created later.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
4722
/* Module init for IPv6 routing: dst slab cache, pernet subsystems, FIB,
 * xfrm, policy rules, rtnetlink handlers, the netdev notifier and the
 * per-cpu uncached-route lists.  Error labels unwind in reverse order
 * of the corresponding registrations.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same slab as regular ones. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* GETROUTE runs without the RTNL lock (DOIT_UNLOCKED). */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
4804
/* Module teardown: mirrors ip6_route_init() in reverse order. */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}