]> asedeno.scripts.mit.edu Git - linux.git/blob - net/ipv6/route.c
a366c05a239da50e98ced776b66d34f923900701
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/*
 * Outcome of the neighbour reachability check used while scoring routes.
 * Every failure value is negative so callers can test "< 0".
 */
enum rt6_nud_state {
	/* find_match() drops the route outright */
	RT6_NUD_FAIL_HARD	= -3,
	/* neighbour entry is in NUD_FAILED (CONFIG_IPV6_ROUTER_PREF only) */
	RT6_NUD_FAIL_PROBE	= -2,
	/* find_match() scores the route as 0 and asks for round-robin */
	RT6_NUD_FAIL_DO_RR	= -1,
	RT6_NUD_SUCCEED		= 1
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
128 struct uncached_list {
129         spinlock_t              lock;
130         struct list_head        head;
131 };
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161         struct net_device *loopback_dev = net->loopback_dev;
162         int cpu;
163
164         if (dev == loopback_dev)
165                 return;
166
167         for_each_possible_cpu(cpu) {
168                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169                 struct rt6_info *rt;
170
171                 spin_lock_bh(&ul->lock);
172                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173                         struct inet6_dev *rt_idev = rt->rt6i_idev;
174                         struct net_device *rt_dev = rt->dst.dev;
175
176                         if (rt_idev->dev == dev) {
177                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
178                                 in6_dev_put(rt_idev);
179                         }
180
181                         if (rt_dev == dev) {
182                                 rt->dst.dev = loopback_dev;
183                                 dev_hold(rt->dst.dev);
184                                 dev_put(rt_dev);
185                         }
186                 }
187                 spin_unlock_bh(&ul->lock);
188         }
189 }
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213         return neigh_create(&nd_tbl, daddr, dev);
214 }
215
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217                                               struct sk_buff *skb,
218                                               const void *daddr)
219 {
220         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221
222         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
240 static struct dst_ops ip6_dst_ops_template = {
241         .family                 =       AF_INET6,
242         .gc                     =       ip6_dst_gc,
243         .gc_thresh              =       1024,
244         .check                  =       ip6_dst_check,
245         .default_advmss         =       ip6_default_advmss,
246         .mtu                    =       ip6_mtu,
247         .cow_metrics            =       dst_cow_metrics_generic,
248         .destroy                =       ip6_dst_destroy,
249         .ifdown                 =       ip6_dst_ifdown,
250         .negative_advice        =       ip6_negative_advice,
251         .link_failure           =       ip6_link_failure,
252         .update_pmtu            =       ip6_rt_update_pmtu,
253         .redirect               =       rt6_do_redirect,
254         .local_out              =       __ip6_local_out,
255         .neigh_lookup           =       ip6_dst_neigh_lookup,
256         .confirm_neigh          =       ip6_confirm_neigh,
257 };
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267                                          struct sk_buff *skb, u32 mtu)
268 {
269 }
270
/* dst_ops->redirect hook for blackhole dsts: deliberately a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
	/* Redirects are ignored on a blackhole route. */
}
275
276 static struct dst_ops ip6_dst_blackhole_ops = {
277         .family                 =       AF_INET6,
278         .destroy                =       ip6_dst_destroy,
279         .check                  =       ip6_dst_check,
280         .mtu                    =       ip6_blackhole_mtu,
281         .default_advmss         =       ip6_default_advmss,
282         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
283         .redirect               =       ip6_rt_blackhole_redirect,
284         .cow_metrics            =       dst_cow_metrics_generic,
285         .neigh_lookup           =       ip6_dst_neigh_lookup,
286 };
287
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289         [RTAX_HOPLIMIT - 1] = 0,
290 };
291
292 static const struct fib6_info fib6_null_entry_template = {
293         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
294         .fib6_protocol  = RTPROT_KERNEL,
295         .fib6_metric    = ~(u32)0,
296         .fib6_ref       = ATOMIC_INIT(1),
297         .fib6_type      = RTN_UNREACHABLE,
298         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
299 };
300
301 static const struct rt6_info ip6_null_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -ENETUNREACH,
307                 .input          = ip6_pkt_discard,
308                 .output         = ip6_pkt_discard_out,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311 };
312
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Template for the "prohibit" route: packets are handed to
 * ip6_pkt_prohibit()/ip6_pkt_prohibit_out() and the dst reports -EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_flags	= RTF_REJECT | RTF_NONEXTHOP,
	.dst = {
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

/*
 * Template for the "blackhole" route: packets are handed to
 * dst_discard()/dst_discard_out() and the dst reports -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_flags	= RTF_REJECT | RTF_NONEXTHOP,
	.dst = {
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

#endif
340
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343         struct dst_entry *dst = &rt->dst;
344
345         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346         INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351                                int flags)
352 {
353         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354                                         1, DST_OBSOLETE_FORCE_CHK, flags);
355
356         if (rt) {
357                 rt6_info_init(rt);
358                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359         }
360
361         return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
368         struct rt6_info *rt = (struct rt6_info *)dst;
369         struct fib6_info *from;
370         struct inet6_dev *idev;
371
372         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
373                 kfree(p);
374
375         rt6_uncached_list_del(rt);
376
377         idev = rt->rt6i_idev;
378         if (idev) {
379                 rt->rt6i_idev = NULL;
380                 in6_dev_put(idev);
381         }
382
383         rcu_read_lock();
384         from = rcu_dereference(rt->from);
385         rcu_assign_pointer(rt->from, NULL);
386         fib6_info_release(from);
387         rcu_read_unlock();
388 }
389
390 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
391                            int how)
392 {
393         struct rt6_info *rt = (struct rt6_info *)dst;
394         struct inet6_dev *idev = rt->rt6i_idev;
395         struct net_device *loopback_dev =
396                 dev_net(dev)->loopback_dev;
397
398         if (idev && idev->dev != loopback_dev) {
399                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
400                 if (loopback_idev) {
401                         rt->rt6i_idev = loopback_idev;
402                         in6_dev_put(idev);
403                 }
404         }
405 }
406
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409         if (rt->rt6i_flags & RTF_EXPIRES)
410                 return time_after(jiffies, rt->dst.expires);
411         else
412                 return false;
413 }
414
415 static bool rt6_check_expired(const struct rt6_info *rt)
416 {
417         struct fib6_info *from;
418
419         from = rcu_dereference(rt->from);
420
421         if (rt->rt6i_flags & RTF_EXPIRES) {
422                 if (time_after(jiffies, rt->dst.expires))
423                         return true;
424         } else if (from) {
425                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
426                         fib6_check_expired(from);
427         }
428         return false;
429 }
430
431 struct fib6_info *fib6_multipath_select(const struct net *net,
432                                         struct fib6_info *match,
433                                         struct flowi6 *fl6, int oif,
434                                         const struct sk_buff *skb,
435                                         int strict)
436 {
437         struct fib6_info *sibling, *next_sibling;
438
439         /* We might have already computed the hash for ICMPv6 errors. In such
440          * case it will always be non-zero. Otherwise now is the time to do it.
441          */
442         if (!fl6->mp_hash)
443                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
444
445         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
446                 return match;
447
448         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
449                                  fib6_siblings) {
450                 int nh_upper_bound;
451
452                 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
453                 if (fl6->mp_hash > nh_upper_bound)
454                         continue;
455                 if (rt6_score_route(sibling, oif, strict) < 0)
456                         break;
457                 match = sibling;
458                 break;
459         }
460
461         return match;
462 }
463
464 /*
465  *      Route lookup. rcu_read_lock() should be held.
466  */
467
468 static inline struct fib6_info *rt6_device_match(struct net *net,
469                                                  struct fib6_info *rt,
470                                                     const struct in6_addr *saddr,
471                                                     int oif,
472                                                     int flags)
473 {
474         struct fib6_info *sprt;
475
476         if (!oif && ipv6_addr_any(saddr) &&
477             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
478                 return rt;
479
480         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
481                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
482
483                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
484                         continue;
485
486                 if (oif) {
487                         if (dev->ifindex == oif)
488                                 return sprt;
489                 } else {
490                         if (ipv6_chk_addr(net, saddr, dev,
491                                           flags & RT6_LOOKUP_F_IFACE))
492                                 return sprt;
493                 }
494         }
495
496         if (oif && flags & RT6_LOOKUP_F_IFACE)
497                 return net->ipv6.fib6_null_entry;
498
499         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
500 }
501
502 #ifdef CONFIG_IPV6_ROUTER_PREF
503 struct __rt6_probe_work {
504         struct work_struct work;
505         struct in6_addr target;
506         struct net_device *dev;
507 };
508
509 static void rt6_probe_deferred(struct work_struct *w)
510 {
511         struct in6_addr mcaddr;
512         struct __rt6_probe_work *work =
513                 container_of(w, struct __rt6_probe_work, work);
514
515         addrconf_addr_solict_mult(&work->target, &mcaddr);
516         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
517         dev_put(work->dev);
518         kfree(work);
519 }
520
521 static void rt6_probe(struct fib6_info *rt)
522 {
523         struct __rt6_probe_work *work;
524         const struct in6_addr *nh_gw;
525         struct neighbour *neigh;
526         struct net_device *dev;
527
528         /*
529          * Okay, this does not seem to be appropriate
530          * for now, however, we need to check if it
531          * is really so; aka Router Reachability Probing.
532          *
533          * Router Reachability Probe MUST be rate-limited
534          * to no more than one per minute.
535          */
536         if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
537                 return;
538
539         nh_gw = &rt->fib6_nh.nh_gw;
540         dev = rt->fib6_nh.nh_dev;
541         rcu_read_lock_bh();
542         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
543         if (neigh) {
544                 struct inet6_dev *idev;
545
546                 if (neigh->nud_state & NUD_VALID)
547                         goto out;
548
549                 idev = __in6_dev_get(dev);
550                 work = NULL;
551                 write_lock(&neigh->lock);
552                 if (!(neigh->nud_state & NUD_VALID) &&
553                     time_after(jiffies,
554                                neigh->updated + idev->cnf.rtr_probe_interval)) {
555                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
556                         if (work)
557                                 __neigh_set_probe_once(neigh);
558                 }
559                 write_unlock(&neigh->lock);
560         } else {
561                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
562         }
563
564         if (work) {
565                 INIT_WORK(&work->work, rt6_probe_deferred);
566                 work->target = *nh_gw;
567                 dev_hold(dev);
568                 work->dev = dev;
569                 schedule_work(&work->work);
570         }
571
572 out:
573         rcu_read_unlock_bh();
574 }
575 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
	/* nothing to do */
}
579 #endif
580
581 /*
582  * Default Router Selection (RFC 2461 6.3.6)
583  */
584 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
585 {
586         const struct net_device *dev = rt->fib6_nh.nh_dev;
587
588         if (!oif || dev->ifindex == oif)
589                 return 2;
590         return 0;
591 }
592
593 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
594 {
595         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
596         struct neighbour *neigh;
597
598         if (rt->fib6_flags & RTF_NONEXTHOP ||
599             !(rt->fib6_flags & RTF_GATEWAY))
600                 return RT6_NUD_SUCCEED;
601
602         rcu_read_lock_bh();
603         neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
604                                           &rt->fib6_nh.nh_gw);
605         if (neigh) {
606                 read_lock(&neigh->lock);
607                 if (neigh->nud_state & NUD_VALID)
608                         ret = RT6_NUD_SUCCEED;
609 #ifdef CONFIG_IPV6_ROUTER_PREF
610                 else if (!(neigh->nud_state & NUD_FAILED))
611                         ret = RT6_NUD_SUCCEED;
612                 else
613                         ret = RT6_NUD_FAIL_PROBE;
614 #endif
615                 read_unlock(&neigh->lock);
616         } else {
617                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
618                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
619         }
620         rcu_read_unlock_bh();
621
622         return ret;
623 }
624
625 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
626 {
627         int m;
628
629         m = rt6_check_dev(rt, oif);
630         if (!m && (strict & RT6_LOOKUP_F_IFACE))
631                 return RT6_NUD_FAIL_HARD;
632 #ifdef CONFIG_IPV6_ROUTER_PREF
633         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
634 #endif
635         if (strict & RT6_LOOKUP_F_REACHABLE) {
636                 int n = rt6_check_neigh(rt);
637                 if (n < 0)
638                         return n;
639         }
640         return m;
641 }
642
643 /* called with rc_read_lock held */
644 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
645 {
646         const struct net_device *dev = fib6_info_nh_dev(f6i);
647         bool rc = false;
648
649         if (dev) {
650                 const struct inet6_dev *idev = __in6_dev_get(dev);
651
652                 rc = !!idev->cnf.ignore_routes_with_linkdown;
653         }
654
655         return rc;
656 }
657
658 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
659                                    int *mpri, struct fib6_info *match,
660                                    bool *do_rr)
661 {
662         int m;
663         bool match_do_rr = false;
664
665         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
666                 goto out;
667
668         if (fib6_ignore_linkdown(rt) &&
669             rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
670             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
671                 goto out;
672
673         if (fib6_check_expired(rt))
674                 goto out;
675
676         m = rt6_score_route(rt, oif, strict);
677         if (m == RT6_NUD_FAIL_DO_RR) {
678                 match_do_rr = true;
679                 m = 0; /* lowest valid score */
680         } else if (m == RT6_NUD_FAIL_HARD) {
681                 goto out;
682         }
683
684         if (strict & RT6_LOOKUP_F_REACHABLE)
685                 rt6_probe(rt);
686
687         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
688         if (m > *mpri) {
689                 *do_rr = match_do_rr;
690                 *mpri = m;
691                 match = rt;
692         }
693 out:
694         return match;
695 }
696
697 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
698                                      struct fib6_info *leaf,
699                                      struct fib6_info *rr_head,
700                                      u32 metric, int oif, int strict,
701                                      bool *do_rr)
702 {
703         struct fib6_info *rt, *match, *cont;
704         int mpri = -1;
705
706         match = NULL;
707         cont = NULL;
708         for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
709                 if (rt->fib6_metric != metric) {
710                         cont = rt;
711                         break;
712                 }
713
714                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
715         }
716
717         for (rt = leaf; rt && rt != rr_head;
718              rt = rcu_dereference(rt->fib6_next)) {
719                 if (rt->fib6_metric != metric) {
720                         cont = rt;
721                         break;
722                 }
723
724                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
725         }
726
727         if (match || !cont)
728                 return match;
729
730         for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
731                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
732
733         return match;
734 }
735
736 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
737                                    int oif, int strict)
738 {
739         struct fib6_info *leaf = rcu_dereference(fn->leaf);
740         struct fib6_info *match, *rt0;
741         bool do_rr = false;
742         int key_plen;
743
744         if (!leaf || leaf == net->ipv6.fib6_null_entry)
745                 return net->ipv6.fib6_null_entry;
746
747         rt0 = rcu_dereference(fn->rr_ptr);
748         if (!rt0)
749                 rt0 = leaf;
750
751         /* Double check to make sure fn is not an intermediate node
752          * and fn->leaf does not points to its child's leaf
753          * (This might happen if all routes under fn are deleted from
754          * the tree and fib6_repair_tree() is called on the node.)
755          */
756         key_plen = rt0->fib6_dst.plen;
757 #ifdef CONFIG_IPV6_SUBTREES
758         if (rt0->fib6_src.plen)
759                 key_plen = rt0->fib6_src.plen;
760 #endif
761         if (fn->fn_bit != key_plen)
762                 return net->ipv6.fib6_null_entry;
763
764         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
765                              &do_rr);
766
767         if (do_rr) {
768                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
769
770                 /* no entries matched; do round-robin */
771                 if (!next || next->fib6_metric != rt0->fib6_metric)
772                         next = leaf;
773
774                 if (next != rt0) {
775                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
776                         /* make sure next is not being deleted from the tree */
777                         if (next->fib6_node)
778                                 rcu_assign_pointer(fn->rr_ptr, next);
779                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
780                 }
781         }
782
783         return match ? match : net->ipv6.fib6_null_entry;
784 }
785
786 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
787 {
788         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
789 }
790
791 #ifdef CONFIG_IPV6_ROUTE_INFO
792 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
793                   const struct in6_addr *gwaddr)
794 {
795         struct net *net = dev_net(dev);
796         struct route_info *rinfo = (struct route_info *) opt;
797         struct in6_addr prefix_buf, *prefix;
798         unsigned int pref;
799         unsigned long lifetime;
800         struct fib6_info *rt;
801
802         if (len < sizeof(struct route_info)) {
803                 return -EINVAL;
804         }
805
806         /* Sanity check for prefix_len and length */
807         if (rinfo->length > 3) {
808                 return -EINVAL;
809         } else if (rinfo->prefix_len > 128) {
810                 return -EINVAL;
811         } else if (rinfo->prefix_len > 64) {
812                 if (rinfo->length < 2) {
813                         return -EINVAL;
814                 }
815         } else if (rinfo->prefix_len > 0) {
816                 if (rinfo->length < 1) {
817                         return -EINVAL;
818                 }
819         }
820
821         pref = rinfo->route_pref;
822         if (pref == ICMPV6_ROUTER_PREF_INVALID)
823                 return -EINVAL;
824
825         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
826
827         if (rinfo->length == 3)
828                 prefix = (struct in6_addr *)rinfo->prefix;
829         else {
830                 /* this function is safe */
831                 ipv6_addr_prefix(&prefix_buf,
832                                  (struct in6_addr *)rinfo->prefix,
833                                  rinfo->prefix_len);
834                 prefix = &prefix_buf;
835         }
836
837         if (rinfo->prefix_len == 0)
838                 rt = rt6_get_dflt_router(net, gwaddr, dev);
839         else
840                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
841                                         gwaddr, dev);
842
843         if (rt && !lifetime) {
844                 ip6_del_rt(net, rt);
845                 rt = NULL;
846         }
847
848         if (!rt && lifetime)
849                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
850                                         dev, pref);
851         else if (rt)
852                 rt->fib6_flags = RTF_ROUTEINFO |
853                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
854
855         if (rt) {
856                 if (!addrconf_finite_timeout(lifetime))
857                         fib6_clean_expires(rt);
858                 else
859                         fib6_set_expires(rt, jiffies + HZ * lifetime);
860
861                 fib6_info_release(rt);
862         }
863         return 0;
864 }
865 #endif
866
867 /*
868  *      Misc support functions
869  */
870
871 /* called with rcu_lock held */
872 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
873 {
874         struct net_device *dev = rt->fib6_nh.nh_dev;
875
876         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
877                 /* for copies of local routes, dst->dev needs to be the
878                  * device if it is a master device, the master device if
879                  * device is enslaved, and the loopback as the default
880                  */
881                 if (netif_is_l3_slave(dev) &&
882                     !rt6_need_strict(&rt->fib6_dst.addr))
883                         dev = l3mdev_master_dev_rcu(dev);
884                 else if (!netif_is_l3_master(dev))
885                         dev = dev_net(dev)->loopback_dev;
886                 /* last case is netif_is_l3_master(dev) is true in which
887                  * case we want dev returned to be dev
888                  */
889         }
890
891         return dev;
892 }
893
894 static const int fib6_prop[RTN_MAX + 1] = {
895         [RTN_UNSPEC]    = 0,
896         [RTN_UNICAST]   = 0,
897         [RTN_LOCAL]     = 0,
898         [RTN_BROADCAST] = 0,
899         [RTN_ANYCAST]   = 0,
900         [RTN_MULTICAST] = 0,
901         [RTN_BLACKHOLE] = -EINVAL,
902         [RTN_UNREACHABLE] = -EHOSTUNREACH,
903         [RTN_PROHIBIT]  = -EACCES,
904         [RTN_THROW]     = -EAGAIN,
905         [RTN_NAT]       = -EINVAL,
906         [RTN_XRESOLVE]  = -EINVAL,
907 };
908
909 static int ip6_rt_type_to_error(u8 fib6_type)
910 {
911         return fib6_prop[fib6_type];
912 }
913
914 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
915 {
916         unsigned short flags = 0;
917
918         if (rt->dst_nocount)
919                 flags |= DST_NOCOUNT;
920         if (rt->dst_nopolicy)
921                 flags |= DST_NOPOLICY;
922         if (rt->dst_host)
923                 flags |= DST_HOST;
924
925         return flags;
926 }
927
928 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
929 {
930         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
931
932         switch (ort->fib6_type) {
933         case RTN_BLACKHOLE:
934                 rt->dst.output = dst_discard_out;
935                 rt->dst.input = dst_discard;
936                 break;
937         case RTN_PROHIBIT:
938                 rt->dst.output = ip6_pkt_prohibit_out;
939                 rt->dst.input = ip6_pkt_prohibit;
940                 break;
941         case RTN_THROW:
942         case RTN_UNREACHABLE:
943         default:
944                 rt->dst.output = ip6_pkt_discard_out;
945                 rt->dst.input = ip6_pkt_discard;
946                 break;
947         }
948 }
949
950 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
951 {
952         if (ort->fib6_flags & RTF_REJECT) {
953                 ip6_rt_init_dst_reject(rt, ort);
954                 return;
955         }
956
957         rt->dst.error = 0;
958         rt->dst.output = ip6_output;
959
960         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
961                 rt->dst.input = ip6_input;
962         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
963                 rt->dst.input = ip6_mc_input;
964         } else {
965                 rt->dst.input = ip6_forward;
966         }
967
968         if (ort->fib6_nh.nh_lwtstate) {
969                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
970                 lwtunnel_set_redirect(&rt->dst);
971         }
972
973         rt->dst.lastuse = jiffies;
974 }
975
976 /* Caller must already hold reference to @from */
977 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
978 {
979         rt->rt6i_flags &= ~RTF_EXPIRES;
980         rcu_assign_pointer(rt->from, from);
981         dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
982         if (from->fib6_metrics != &dst_default_metrics) {
983                 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
984                 refcount_inc(&from->fib6_metrics->refcnt);
985         }
986 }
987
988 /* Caller must already hold reference to @ort */
989 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
990 {
991         struct net_device *dev = fib6_info_nh_dev(ort);
992
993         ip6_rt_init_dst(rt, ort);
994
995         rt->rt6i_dst = ort->fib6_dst;
996         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
997         rt->rt6i_gateway = ort->fib6_nh.nh_gw;
998         rt->rt6i_flags = ort->fib6_flags;
999         rt6_set_from(rt, ort);
1000 #ifdef CONFIG_IPV6_SUBTREES
1001         rt->rt6i_src = ort->fib6_src;
1002 #endif
1003         rt->rt6i_prefsrc = ort->fib6_prefsrc;
1004 }
1005
1006 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1007                                         struct in6_addr *saddr)
1008 {
1009         struct fib6_node *pn, *sn;
1010         while (1) {
1011                 if (fn->fn_flags & RTN_TL_ROOT)
1012                         return NULL;
1013                 pn = rcu_dereference(fn->parent);
1014                 sn = FIB6_SUBTREE(pn);
1015                 if (sn && sn != fn)
1016                         fn = fib6_node_lookup(sn, NULL, saddr);
1017                 else
1018                         fn = pn;
1019                 if (fn->fn_flags & RTN_RTINFO)
1020                         return fn;
1021         }
1022 }
1023
1024 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1025                           bool null_fallback)
1026 {
1027         struct rt6_info *rt = *prt;
1028
1029         if (dst_hold_safe(&rt->dst))
1030                 return true;
1031         if (null_fallback) {
1032                 rt = net->ipv6.ip6_null_entry;
1033                 dst_hold(&rt->dst);
1034         } else {
1035                 rt = NULL;
1036         }
1037         *prt = rt;
1038         return false;
1039 }
1040
1041 /* called with rcu_lock held */
1042 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1043 {
1044         unsigned short flags = fib6_info_dst_flags(rt);
1045         struct net_device *dev = rt->fib6_nh.nh_dev;
1046         struct rt6_info *nrt;
1047
1048         if (!fib6_info_hold_safe(rt))
1049                 return NULL;
1050
1051         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1052         if (nrt)
1053                 ip6_rt_copy_init(nrt, rt);
1054         else
1055                 fib6_info_release(rt);
1056
1057         return nrt;
1058 }
1059
1060 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1061                                              struct fib6_table *table,
1062                                              struct flowi6 *fl6,
1063                                              const struct sk_buff *skb,
1064                                              int flags)
1065 {
1066         struct fib6_info *f6i;
1067         struct fib6_node *fn;
1068         struct rt6_info *rt;
1069
1070         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1071                 flags &= ~RT6_LOOKUP_F_IFACE;
1072
1073         rcu_read_lock();
1074         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1075 restart:
1076         f6i = rcu_dereference(fn->leaf);
1077         if (!f6i) {
1078                 f6i = net->ipv6.fib6_null_entry;
1079         } else {
1080                 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1081                                       fl6->flowi6_oif, flags);
1082                 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1083                         f6i = fib6_multipath_select(net, f6i, fl6,
1084                                                     fl6->flowi6_oif, skb,
1085                                                     flags);
1086         }
1087         if (f6i == net->ipv6.fib6_null_entry) {
1088                 fn = fib6_backtrack(fn, &fl6->saddr);
1089                 if (fn)
1090                         goto restart;
1091         }
1092
1093         trace_fib6_table_lookup(net, f6i, table, fl6);
1094
1095         /* Search through exception table */
1096         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1097         if (rt) {
1098                 if (ip6_hold_safe(net, &rt, true))
1099                         dst_use_noref(&rt->dst, jiffies);
1100         } else if (f6i == net->ipv6.fib6_null_entry) {
1101                 rt = net->ipv6.ip6_null_entry;
1102                 dst_hold(&rt->dst);
1103         } else {
1104                 rt = ip6_create_rt_rcu(f6i);
1105                 if (!rt) {
1106                         rt = net->ipv6.ip6_null_entry;
1107                         dst_hold(&rt->dst);
1108                 }
1109         }
1110
1111         rcu_read_unlock();
1112
1113         return rt;
1114 }
1115
1116 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1117                                    const struct sk_buff *skb, int flags)
1118 {
1119         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1120 }
1121 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1122
1123 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1124                             const struct in6_addr *saddr, int oif,
1125                             const struct sk_buff *skb, int strict)
1126 {
1127         struct flowi6 fl6 = {
1128                 .flowi6_oif = oif,
1129                 .daddr = *daddr,
1130         };
1131         struct dst_entry *dst;
1132         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1133
1134         if (saddr) {
1135                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1136                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1137         }
1138
1139         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1140         if (dst->error == 0)
1141                 return (struct rt6_info *) dst;
1142
1143         dst_release(dst);
1144
1145         return NULL;
1146 }
1147 EXPORT_SYMBOL(rt6_lookup);
1148
1149 /* ip6_ins_rt is called with FREE table->tb6_lock.
1150  * It takes new route entry, the addition fails by any reason the
1151  * route is released.
1152  * Caller must hold dst before calling it.
1153  */
1154
1155 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1156                         struct netlink_ext_ack *extack)
1157 {
1158         int err;
1159         struct fib6_table *table;
1160
1161         table = rt->fib6_table;
1162         spin_lock_bh(&table->tb6_lock);
1163         err = fib6_add(&table->tb6_root, rt, info, extack);
1164         spin_unlock_bh(&table->tb6_lock);
1165
1166         return err;
1167 }
1168
1169 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1170 {
1171         struct nl_info info = { .nl_net = net, };
1172
1173         return __ip6_ins_rt(rt, &info, NULL);
1174 }
1175
1176 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1177                                            const struct in6_addr *daddr,
1178                                            const struct in6_addr *saddr)
1179 {
1180         struct net_device *dev;
1181         struct rt6_info *rt;
1182
1183         /*
1184          *      Clone the route.
1185          */
1186
1187         if (!fib6_info_hold_safe(ort))
1188                 return NULL;
1189
1190         dev = ip6_rt_get_dev_rcu(ort);
1191         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1192         if (!rt) {
1193                 fib6_info_release(ort);
1194                 return NULL;
1195         }
1196
1197         ip6_rt_copy_init(rt, ort);
1198         rt->rt6i_flags |= RTF_CACHE;
1199         rt->dst.flags |= DST_HOST;
1200         rt->rt6i_dst.addr = *daddr;
1201         rt->rt6i_dst.plen = 128;
1202
1203         if (!rt6_is_gw_or_nonexthop(ort)) {
1204                 if (ort->fib6_dst.plen != 128 &&
1205                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1206                         rt->rt6i_flags |= RTF_ANYCAST;
1207 #ifdef CONFIG_IPV6_SUBTREES
1208                 if (rt->rt6i_src.plen && saddr) {
1209                         rt->rt6i_src.addr = *saddr;
1210                         rt->rt6i_src.plen = 128;
1211                 }
1212 #endif
1213         }
1214
1215         return rt;
1216 }
1217
1218 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1219 {
1220         unsigned short flags = fib6_info_dst_flags(rt);
1221         struct net_device *dev;
1222         struct rt6_info *pcpu_rt;
1223
1224         if (!fib6_info_hold_safe(rt))
1225                 return NULL;
1226
1227         rcu_read_lock();
1228         dev = ip6_rt_get_dev_rcu(rt);
1229         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1230         rcu_read_unlock();
1231         if (!pcpu_rt) {
1232                 fib6_info_release(rt);
1233                 return NULL;
1234         }
1235         ip6_rt_copy_init(pcpu_rt, rt);
1236         pcpu_rt->rt6i_flags |= RTF_PCPU;
1237         return pcpu_rt;
1238 }
1239
1240 /* It should be called with rcu_read_lock() acquired */
1241 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1242 {
1243         struct rt6_info *pcpu_rt, **p;
1244
1245         p = this_cpu_ptr(rt->rt6i_pcpu);
1246         pcpu_rt = *p;
1247
1248         if (pcpu_rt)
1249                 ip6_hold_safe(NULL, &pcpu_rt, false);
1250
1251         return pcpu_rt;
1252 }
1253
1254 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1255                                             struct fib6_info *rt)
1256 {
1257         struct rt6_info *pcpu_rt, *prev, **p;
1258
1259         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1260         if (!pcpu_rt) {
1261                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1262                 return net->ipv6.ip6_null_entry;
1263         }
1264
1265         dst_hold(&pcpu_rt->dst);
1266         p = this_cpu_ptr(rt->rt6i_pcpu);
1267         prev = cmpxchg(p, NULL, pcpu_rt);
1268         BUG_ON(prev);
1269
1270         return pcpu_rt;
1271 }
1272
/* Exception (cached route) hash table implementation.
 *
 * rt6_exception_lock is the one spinlock under which the functions below
 * add entries to and remove entries from the exception buckets; lookups on
 * the fast path walk the chains under rcu_read_lock() instead.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1276
1277 /* Remove rt6_ex from hash table and free the memory
1278  * Caller must hold rt6_exception_lock
1279  */
1280 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1281                                  struct rt6_exception *rt6_ex)
1282 {
1283         struct net *net;
1284
1285         if (!bucket || !rt6_ex)
1286                 return;
1287
1288         net = dev_net(rt6_ex->rt6i->dst.dev);
1289         hlist_del_rcu(&rt6_ex->hlist);
1290         dst_release(&rt6_ex->rt6i->dst);
1291         kfree_rcu(rt6_ex, rcu);
1292         WARN_ON_ONCE(!bucket->depth);
1293         bucket->depth--;
1294         net->ipv6.rt6_stats->fib_rt_cache--;
1295 }
1296
1297 /* Remove oldest rt6_ex in bucket and free the memory
1298  * Caller must hold rt6_exception_lock
1299  */
1300 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1301 {
1302         struct rt6_exception *rt6_ex, *oldest = NULL;
1303
1304         if (!bucket)
1305                 return;
1306
1307         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1308                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1309                         oldest = rt6_ex;
1310         }
1311         rt6_remove_exception(bucket, oldest);
1312 }
1313
1314 static u32 rt6_exception_hash(const struct in6_addr *dst,
1315                               const struct in6_addr *src)
1316 {
1317         static u32 seed __read_mostly;
1318         u32 val;
1319
1320         net_get_random_once(&seed, sizeof(seed));
1321         val = jhash(dst, sizeof(*dst), seed);
1322
1323 #ifdef CONFIG_IPV6_SUBTREES
1324         if (src)
1325                 val = jhash(src, sizeof(*src), val);
1326 #endif
1327         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1328 }
1329
1330 /* Helper function to find the cached rt in the hash table
1331  * and update bucket pointer to point to the bucket for this
1332  * (daddr, saddr) pair
1333  * Caller must hold rt6_exception_lock
1334  */
1335 static struct rt6_exception *
1336 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1337                               const struct in6_addr *daddr,
1338                               const struct in6_addr *saddr)
1339 {
1340         struct rt6_exception *rt6_ex;
1341         u32 hval;
1342
1343         if (!(*bucket) || !daddr)
1344                 return NULL;
1345
1346         hval = rt6_exception_hash(daddr, saddr);
1347         *bucket += hval;
1348
1349         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1350                 struct rt6_info *rt6 = rt6_ex->rt6i;
1351                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1352
1353 #ifdef CONFIG_IPV6_SUBTREES
1354                 if (matched && saddr)
1355                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1356 #endif
1357                 if (matched)
1358                         return rt6_ex;
1359         }
1360         return NULL;
1361 }
1362
1363 /* Helper function to find the cached rt in the hash table
1364  * and update bucket pointer to point to the bucket for this
1365  * (daddr, saddr) pair
1366  * Caller must hold rcu_read_lock()
1367  */
1368 static struct rt6_exception *
1369 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1370                          const struct in6_addr *daddr,
1371                          const struct in6_addr *saddr)
1372 {
1373         struct rt6_exception *rt6_ex;
1374         u32 hval;
1375
1376         WARN_ON_ONCE(!rcu_read_lock_held());
1377
1378         if (!(*bucket) || !daddr)
1379                 return NULL;
1380
1381         hval = rt6_exception_hash(daddr, saddr);
1382         *bucket += hval;
1383
1384         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1385                 struct rt6_info *rt6 = rt6_ex->rt6i;
1386                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1387
1388 #ifdef CONFIG_IPV6_SUBTREES
1389                 if (matched && saddr)
1390                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1391 #endif
1392                 if (matched)
1393                         return rt6_ex;
1394         }
1395         return NULL;
1396 }
1397
1398 static unsigned int fib6_mtu(const struct fib6_info *rt)
1399 {
1400         unsigned int mtu;
1401
1402         if (rt->fib6_pmtu) {
1403                 mtu = rt->fib6_pmtu;
1404         } else {
1405                 struct net_device *dev = fib6_info_nh_dev(rt);
1406                 struct inet6_dev *idev;
1407
1408                 rcu_read_lock();
1409                 idev = __in6_dev_get(dev);
1410                 mtu = idev->cnf.mtu6;
1411                 rcu_read_unlock();
1412         }
1413
1414         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1415
1416         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1417 }
1418
1419 static int rt6_insert_exception(struct rt6_info *nrt,
1420                                 struct fib6_info *ort)
1421 {
1422         struct net *net = dev_net(nrt->dst.dev);
1423         struct rt6_exception_bucket *bucket;
1424         struct in6_addr *src_key = NULL;
1425         struct rt6_exception *rt6_ex;
1426         int err = 0;
1427
1428         spin_lock_bh(&rt6_exception_lock);
1429
1430         if (ort->exception_bucket_flushed) {
1431                 err = -EINVAL;
1432                 goto out;
1433         }
1434
1435         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1436                                         lockdep_is_held(&rt6_exception_lock));
1437         if (!bucket) {
1438                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1439                                  GFP_ATOMIC);
1440                 if (!bucket) {
1441                         err = -ENOMEM;
1442                         goto out;
1443                 }
1444                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1445         }
1446
1447 #ifdef CONFIG_IPV6_SUBTREES
1448         /* rt6i_src.plen != 0 indicates ort is in subtree
1449          * and exception table is indexed by a hash of
1450          * both rt6i_dst and rt6i_src.
1451          * Otherwise, the exception table is indexed by
1452          * a hash of only rt6i_dst.
1453          */
1454         if (ort->fib6_src.plen)
1455                 src_key = &nrt->rt6i_src.addr;
1456 #endif
1457
1458         /* Update rt6i_prefsrc as it could be changed
1459          * in rt6_remove_prefsrc()
1460          */
1461         nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1462         /* rt6_mtu_change() might lower mtu on ort.
1463          * Only insert this exception route if its mtu
1464          * is less than ort's mtu value.
1465          */
1466         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1467                 err = -EINVAL;
1468                 goto out;
1469         }
1470
1471         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1472                                                src_key);
1473         if (rt6_ex)
1474                 rt6_remove_exception(bucket, rt6_ex);
1475
1476         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1477         if (!rt6_ex) {
1478                 err = -ENOMEM;
1479                 goto out;
1480         }
1481         rt6_ex->rt6i = nrt;
1482         rt6_ex->stamp = jiffies;
1483         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1484         bucket->depth++;
1485         net->ipv6.rt6_stats->fib_rt_cache++;
1486
1487         if (bucket->depth > FIB6_MAX_DEPTH)
1488                 rt6_exception_remove_oldest(bucket);
1489
1490 out:
1491         spin_unlock_bh(&rt6_exception_lock);
1492
1493         /* Update fn->fn_sernum to invalidate all cached dst */
1494         if (!err) {
1495                 spin_lock_bh(&ort->fib6_table->tb6_lock);
1496                 fib6_update_sernum(net, ort);
1497                 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1498                 fib6_force_start_gc(net);
1499         }
1500
1501         return err;
1502 }
1503
1504 void rt6_flush_exceptions(struct fib6_info *rt)
1505 {
1506         struct rt6_exception_bucket *bucket;
1507         struct rt6_exception *rt6_ex;
1508         struct hlist_node *tmp;
1509         int i;
1510
1511         spin_lock_bh(&rt6_exception_lock);
1512         /* Prevent rt6_insert_exception() to recreate the bucket list */
1513         rt->exception_bucket_flushed = 1;
1514
1515         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1516                                     lockdep_is_held(&rt6_exception_lock));
1517         if (!bucket)
1518                 goto out;
1519
1520         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1521                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1522                         rt6_remove_exception(bucket, rt6_ex);
1523                 WARN_ON_ONCE(bucket->depth);
1524                 bucket++;
1525         }
1526
1527 out:
1528         spin_unlock_bh(&rt6_exception_lock);
1529 }
1530
1531 /* Find cached rt in the hash table inside passed in rt
1532  * Caller has to hold rcu_read_lock()
1533  */
1534 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1535                                            struct in6_addr *daddr,
1536                                            struct in6_addr *saddr)
1537 {
1538         struct rt6_exception_bucket *bucket;
1539         struct in6_addr *src_key = NULL;
1540         struct rt6_exception *rt6_ex;
1541         struct rt6_info *res = NULL;
1542
1543         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1544
1545 #ifdef CONFIG_IPV6_SUBTREES
1546         /* rt6i_src.plen != 0 indicates rt is in subtree
1547          * and exception table is indexed by a hash of
1548          * both rt6i_dst and rt6i_src.
1549          * Otherwise, the exception table is indexed by
1550          * a hash of only rt6i_dst.
1551          */
1552         if (rt->fib6_src.plen)
1553                 src_key = saddr;
1554 #endif
1555         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1556
1557         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1558                 res = rt6_ex->rt6i;
1559
1560         return res;
1561 }
1562
1563 /* Remove the passed in cached rt from the hash table that contains it */
1564 static int rt6_remove_exception_rt(struct rt6_info *rt)
1565 {
1566         struct rt6_exception_bucket *bucket;
1567         struct in6_addr *src_key = NULL;
1568         struct rt6_exception *rt6_ex;
1569         struct fib6_info *from;
1570         int err;
1571
1572         from = rcu_dereference(rt->from);
1573         if (!from ||
1574             !(rt->rt6i_flags & RTF_CACHE))
1575                 return -EINVAL;
1576
1577         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1578                 return -ENOENT;
1579
1580         spin_lock_bh(&rt6_exception_lock);
1581         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1582                                     lockdep_is_held(&rt6_exception_lock));
1583 #ifdef CONFIG_IPV6_SUBTREES
1584         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1585          * and exception table is indexed by a hash of
1586          * both rt6i_dst and rt6i_src.
1587          * Otherwise, the exception table is indexed by
1588          * a hash of only rt6i_dst.
1589          */
1590         if (from->fib6_src.plen)
1591                 src_key = &rt->rt6i_src.addr;
1592 #endif
1593         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1594                                                &rt->rt6i_dst.addr,
1595                                                src_key);
1596         if (rt6_ex) {
1597                 rt6_remove_exception(bucket, rt6_ex);
1598                 err = 0;
1599         } else {
1600                 err = -ENOENT;
1601         }
1602
1603         spin_unlock_bh(&rt6_exception_lock);
1604         return err;
1605 }
1606
1607 /* Find rt6_ex which contains the passed in rt cache and
1608  * refresh its stamp
1609  */
1610 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1611 {
1612         struct rt6_exception_bucket *bucket;
1613         struct fib6_info *from = rt->from;
1614         struct in6_addr *src_key = NULL;
1615         struct rt6_exception *rt6_ex;
1616
1617         if (!from ||
1618             !(rt->rt6i_flags & RTF_CACHE))
1619                 return;
1620
1621         rcu_read_lock();
1622         bucket = rcu_dereference(from->rt6i_exception_bucket);
1623
1624 #ifdef CONFIG_IPV6_SUBTREES
1625         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1626          * and exception table is indexed by a hash of
1627          * both rt6i_dst and rt6i_src.
1628          * Otherwise, the exception table is indexed by
1629          * a hash of only rt6i_dst.
1630          */
1631         if (from->fib6_src.plen)
1632                 src_key = &rt->rt6i_src.addr;
1633 #endif
1634         rt6_ex = __rt6_find_exception_rcu(&bucket,
1635                                           &rt->rt6i_dst.addr,
1636                                           src_key);
1637         if (rt6_ex)
1638                 rt6_ex->stamp = jiffies;
1639
1640         rcu_read_unlock();
1641 }
1642
1643 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1644 {
1645         struct rt6_exception_bucket *bucket;
1646         struct rt6_exception *rt6_ex;
1647         int i;
1648
1649         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1650                                         lockdep_is_held(&rt6_exception_lock));
1651
1652         if (bucket) {
1653                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1654                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1655                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1656                         }
1657                         bucket++;
1658                 }
1659         }
1660 }
1661
1662 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1663                                          struct rt6_info *rt, int mtu)
1664 {
1665         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1666          * lowest MTU in the path: always allow updating the route PMTU to
1667          * reflect PMTU decreases.
1668          *
1669          * If the new MTU is higher, and the route PMTU is equal to the local
1670          * MTU, this means the old MTU is the lowest in the path, so allow
1671          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1672          * handle this.
1673          */
1674
1675         if (dst_mtu(&rt->dst) >= mtu)
1676                 return true;
1677
1678         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1679                 return true;
1680
1681         return false;
1682 }
1683
1684 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1685                                        struct fib6_info *rt, int mtu)
1686 {
1687         struct rt6_exception_bucket *bucket;
1688         struct rt6_exception *rt6_ex;
1689         int i;
1690
1691         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1692                                         lockdep_is_held(&rt6_exception_lock));
1693
1694         if (!bucket)
1695                 return;
1696
1697         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1698                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1699                         struct rt6_info *entry = rt6_ex->rt6i;
1700
1701                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1702                          * route), the metrics of its rt->from have already
1703                          * been updated.
1704                          */
1705                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1706                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1707                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1708                 }
1709                 bucket++;
1710         }
1711 }
1712
/* both flags must be set: a cached route that goes through a gateway */
#define RTF_CACHE_GATEWAY	(RTF_CACHE | RTF_GATEWAY)
1714
1715 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1716                                         struct in6_addr *gateway)
1717 {
1718         struct rt6_exception_bucket *bucket;
1719         struct rt6_exception *rt6_ex;
1720         struct hlist_node *tmp;
1721         int i;
1722
1723         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1724                 return;
1725
1726         spin_lock_bh(&rt6_exception_lock);
1727         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1728                                      lockdep_is_held(&rt6_exception_lock));
1729
1730         if (bucket) {
1731                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1732                         hlist_for_each_entry_safe(rt6_ex, tmp,
1733                                                   &bucket->chain, hlist) {
1734                                 struct rt6_info *entry = rt6_ex->rt6i;
1735
1736                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1737                                     RTF_CACHE_GATEWAY &&
1738                                     ipv6_addr_equal(gateway,
1739                                                     &entry->rt6i_gateway)) {
1740                                         rt6_remove_exception(bucket, rt6_ex);
1741                                 }
1742                         }
1743                         bucket++;
1744                 }
1745         }
1746
1747         spin_unlock_bh(&rt6_exception_lock);
1748 }
1749
1750 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1751                                       struct rt6_exception *rt6_ex,
1752                                       struct fib6_gc_args *gc_args,
1753                                       unsigned long now)
1754 {
1755         struct rt6_info *rt = rt6_ex->rt6i;
1756
1757         /* we are pruning and obsoleting aged-out and non gateway exceptions
1758          * even if others have still references to them, so that on next
1759          * dst_check() such references can be dropped.
1760          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1761          * expired, independently from their aging, as per RFC 8201 section 4
1762          */
1763         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1764                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1765                         RT6_TRACE("aging clone %p\n", rt);
1766                         rt6_remove_exception(bucket, rt6_ex);
1767                         return;
1768                 }
1769         } else if (time_after(jiffies, rt->dst.expires)) {
1770                 RT6_TRACE("purging expired route %p\n", rt);
1771                 rt6_remove_exception(bucket, rt6_ex);
1772                 return;
1773         }
1774
1775         if (rt->rt6i_flags & RTF_GATEWAY) {
1776                 struct neighbour *neigh;
1777                 __u8 neigh_flags = 0;
1778
1779                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1780                 if (neigh)
1781                         neigh_flags = neigh->flags;
1782
1783                 if (!(neigh_flags & NTF_ROUTER)) {
1784                         RT6_TRACE("purging route %p via non-router but gateway\n",
1785                                   rt);
1786                         rt6_remove_exception(bucket, rt6_ex);
1787                         return;
1788                 }
1789         }
1790
1791         gc_args->more++;
1792 }
1793
1794 void rt6_age_exceptions(struct fib6_info *rt,
1795                         struct fib6_gc_args *gc_args,
1796                         unsigned long now)
1797 {
1798         struct rt6_exception_bucket *bucket;
1799         struct rt6_exception *rt6_ex;
1800         struct hlist_node *tmp;
1801         int i;
1802
1803         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1804                 return;
1805
1806         rcu_read_lock_bh();
1807         spin_lock(&rt6_exception_lock);
1808         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1809                                     lockdep_is_held(&rt6_exception_lock));
1810
1811         if (bucket) {
1812                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1813                         hlist_for_each_entry_safe(rt6_ex, tmp,
1814                                                   &bucket->chain, hlist) {
1815                                 rt6_age_examine_exception(bucket, rt6_ex,
1816                                                           gc_args, now);
1817                         }
1818                         bucket++;
1819                 }
1820         }
1821         spin_unlock(&rt6_exception_lock);
1822         rcu_read_unlock_bh();
1823 }
1824
1825 /* must be called with rcu lock held */
1826 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1827                                     int oif, struct flowi6 *fl6, int strict)
1828 {
1829         struct fib6_node *fn, *saved_fn;
1830         struct fib6_info *f6i;
1831
1832         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1833         saved_fn = fn;
1834
1835         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1836                 oif = 0;
1837
1838 redo_rt6_select:
1839         f6i = rt6_select(net, fn, oif, strict);
1840         if (f6i == net->ipv6.fib6_null_entry) {
1841                 fn = fib6_backtrack(fn, &fl6->saddr);
1842                 if (fn)
1843                         goto redo_rt6_select;
1844                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1845                         /* also consider unreachable route */
1846                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1847                         fn = saved_fn;
1848                         goto redo_rt6_select;
1849                 }
1850         }
1851
1852         trace_fib6_table_lookup(net, f6i, table, fl6);
1853
1854         return f6i;
1855 }
1856
1857 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1858                                int oif, struct flowi6 *fl6,
1859                                const struct sk_buff *skb, int flags)
1860 {
1861         struct fib6_info *f6i;
1862         struct rt6_info *rt;
1863         int strict = 0;
1864
1865         strict |= flags & RT6_LOOKUP_F_IFACE;
1866         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1867         if (net->ipv6.devconf_all->forwarding == 0)
1868                 strict |= RT6_LOOKUP_F_REACHABLE;
1869
1870         rcu_read_lock();
1871
1872         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1873         if (f6i->fib6_nsiblings)
1874                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1875
1876         if (f6i == net->ipv6.fib6_null_entry) {
1877                 rt = net->ipv6.ip6_null_entry;
1878                 rcu_read_unlock();
1879                 dst_hold(&rt->dst);
1880                 return rt;
1881         }
1882
1883         /*Search through exception table */
1884         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1885         if (rt) {
1886                 if (ip6_hold_safe(net, &rt, true))
1887                         dst_use_noref(&rt->dst, jiffies);
1888
1889                 rcu_read_unlock();
1890                 return rt;
1891         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1892                             !(f6i->fib6_flags & RTF_GATEWAY))) {
1893                 /* Create a RTF_CACHE clone which will not be
1894                  * owned by the fib6 tree.  It is for the special case where
1895                  * the daddr in the skb during the neighbor look-up is different
1896                  * from the fl6->daddr used to look-up route here.
1897                  */
1898                 struct rt6_info *uncached_rt;
1899
1900                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1901
1902                 rcu_read_unlock();
1903
1904                 if (uncached_rt) {
1905                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1906                          * No need for another dst_hold()
1907                          */
1908                         rt6_uncached_list_add(uncached_rt);
1909                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1910                 } else {
1911                         uncached_rt = net->ipv6.ip6_null_entry;
1912                         dst_hold(&uncached_rt->dst);
1913                 }
1914
1915                 return uncached_rt;
1916         } else {
1917                 /* Get a percpu copy */
1918
1919                 struct rt6_info *pcpu_rt;
1920
1921                 local_bh_disable();
1922                 pcpu_rt = rt6_get_pcpu_route(f6i);
1923
1924                 if (!pcpu_rt)
1925                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1926
1927                 local_bh_enable();
1928                 rcu_read_unlock();
1929
1930                 return pcpu_rt;
1931         }
1932 }
1933 EXPORT_SYMBOL_GPL(ip6_pol_route);
1934
1935 static struct rt6_info *ip6_pol_route_input(struct net *net,
1936                                             struct fib6_table *table,
1937                                             struct flowi6 *fl6,
1938                                             const struct sk_buff *skb,
1939                                             int flags)
1940 {
1941         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1942 }
1943
1944 struct dst_entry *ip6_route_input_lookup(struct net *net,
1945                                          struct net_device *dev,
1946                                          struct flowi6 *fl6,
1947                                          const struct sk_buff *skb,
1948                                          int flags)
1949 {
1950         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1951                 flags |= RT6_LOOKUP_F_IFACE;
1952
1953         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1954 }
1955 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1956
1957 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1958                                   struct flow_keys *keys,
1959                                   struct flow_keys *flkeys)
1960 {
1961         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1962         const struct ipv6hdr *key_iph = outer_iph;
1963         struct flow_keys *_flkeys = flkeys;
1964         const struct ipv6hdr *inner_iph;
1965         const struct icmp6hdr *icmph;
1966         struct ipv6hdr _inner_iph;
1967         struct icmp6hdr _icmph;
1968
1969         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1970                 goto out;
1971
1972         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1973                                    sizeof(_icmph), &_icmph);
1974         if (!icmph)
1975                 goto out;
1976
1977         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1978             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1979             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1980             icmph->icmp6_type != ICMPV6_PARAMPROB)
1981                 goto out;
1982
1983         inner_iph = skb_header_pointer(skb,
1984                                        skb_transport_offset(skb) + sizeof(*icmph),
1985                                        sizeof(_inner_iph), &_inner_iph);
1986         if (!inner_iph)
1987                 goto out;
1988
1989         key_iph = inner_iph;
1990         _flkeys = NULL;
1991 out:
1992         if (_flkeys) {
1993                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1994                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1995                 keys->tags.flow_label = _flkeys->tags.flow_label;
1996                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1997         } else {
1998                 keys->addrs.v6addrs.src = key_iph->saddr;
1999                 keys->addrs.v6addrs.dst = key_iph->daddr;
2000                 keys->tags.flow_label = ip6_flowlabel(key_iph);
2001                 keys->basic.ip_proto = key_iph->nexthdr;
2002         }
2003 }
2004
2005 /* if skb is set it will be used and fl6 can be NULL */
2006 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2007                        const struct sk_buff *skb, struct flow_keys *flkeys)
2008 {
2009         struct flow_keys hash_keys;
2010         u32 mhash;
2011
2012         switch (ip6_multipath_hash_policy(net)) {
2013         case 0:
2014                 memset(&hash_keys, 0, sizeof(hash_keys));
2015                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2016                 if (skb) {
2017                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2018                 } else {
2019                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2020                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2021                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2022                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2023                 }
2024                 break;
2025         case 1:
2026                 if (skb) {
2027                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2028                         struct flow_keys keys;
2029
2030                         /* short-circuit if we already have L4 hash present */
2031                         if (skb->l4_hash)
2032                                 return skb_get_hash_raw(skb) >> 1;
2033
2034                         memset(&hash_keys, 0, sizeof(hash_keys));
2035
2036                         if (!flkeys) {
2037                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2038                                 flkeys = &keys;
2039                         }
2040                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2041                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2042                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2043                         hash_keys.ports.src = flkeys->ports.src;
2044                         hash_keys.ports.dst = flkeys->ports.dst;
2045                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2046                 } else {
2047                         memset(&hash_keys, 0, sizeof(hash_keys));
2048                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2049                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2050                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2051                         hash_keys.ports.src = fl6->fl6_sport;
2052                         hash_keys.ports.dst = fl6->fl6_dport;
2053                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2054                 }
2055                 break;
2056         }
2057         mhash = flow_hash_from_keys(&hash_keys);
2058
2059         return mhash >> 1;
2060 }
2061
2062 void ip6_route_input(struct sk_buff *skb)
2063 {
2064         const struct ipv6hdr *iph = ipv6_hdr(skb);
2065         struct net *net = dev_net(skb->dev);
2066         int flags = RT6_LOOKUP_F_HAS_SADDR;
2067         struct ip_tunnel_info *tun_info;
2068         struct flowi6 fl6 = {
2069                 .flowi6_iif = skb->dev->ifindex,
2070                 .daddr = iph->daddr,
2071                 .saddr = iph->saddr,
2072                 .flowlabel = ip6_flowinfo(iph),
2073                 .flowi6_mark = skb->mark,
2074                 .flowi6_proto = iph->nexthdr,
2075         };
2076         struct flow_keys *flkeys = NULL, _flkeys;
2077
2078         tun_info = skb_tunnel_info(skb);
2079         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2080                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2081
2082         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2083                 flkeys = &_flkeys;
2084
2085         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2086                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2087         skb_dst_drop(skb);
2088         skb_dst_set(skb,
2089                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2090 }
2091
2092 static struct rt6_info *ip6_pol_route_output(struct net *net,
2093                                              struct fib6_table *table,
2094                                              struct flowi6 *fl6,
2095                                              const struct sk_buff *skb,
2096                                              int flags)
2097 {
2098         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2099 }
2100
2101 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2102                                          struct flowi6 *fl6, int flags)
2103 {
2104         bool any_src;
2105
2106         if (rt6_need_strict(&fl6->daddr)) {
2107                 struct dst_entry *dst;
2108
2109                 dst = l3mdev_link_scope_lookup(net, fl6);
2110                 if (dst)
2111                         return dst;
2112         }
2113
2114         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2115
2116         any_src = ipv6_addr_any(&fl6->saddr);
2117         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2118             (fl6->flowi6_oif && any_src))
2119                 flags |= RT6_LOOKUP_F_IFACE;
2120
2121         if (!any_src)
2122                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2123         else if (sk)
2124                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2125
2126         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2127 }
2128 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2129
2130 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2131 {
2132         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2133         struct net_device *loopback_dev = net->loopback_dev;
2134         struct dst_entry *new = NULL;
2135
2136         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2137                        DST_OBSOLETE_DEAD, 0);
2138         if (rt) {
2139                 rt6_info_init(rt);
2140                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2141
2142                 new = &rt->dst;
2143                 new->__use = 1;
2144                 new->input = dst_discard;
2145                 new->output = dst_discard_out;
2146
2147                 dst_copy_metrics(new, &ort->dst);
2148
2149                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2150                 rt->rt6i_gateway = ort->rt6i_gateway;
2151                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2152
2153                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2154 #ifdef CONFIG_IPV6_SUBTREES
2155                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2156 #endif
2157         }
2158
2159         dst_release(dst_orig);
2160         return new ? new : ERR_PTR(-ENOMEM);
2161 }
2162
2163 /*
2164  *      Destination cache support functions
2165  */
2166
2167 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2168 {
2169         u32 rt_cookie = 0;
2170
2171         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2172                 return false;
2173
2174         if (fib6_check_expired(f6i))
2175                 return false;
2176
2177         return true;
2178 }
2179
2180 static struct dst_entry *rt6_check(struct rt6_info *rt,
2181                                    struct fib6_info *from,
2182                                    u32 cookie)
2183 {
2184         u32 rt_cookie = 0;
2185
2186         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2187             rt_cookie != cookie)
2188                 return NULL;
2189
2190         if (rt6_check_expired(rt))
2191                 return NULL;
2192
2193         return &rt->dst;
2194 }
2195
2196 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2197                                             struct fib6_info *from,
2198                                             u32 cookie)
2199 {
2200         if (!__rt6_check_expired(rt) &&
2201             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2202             fib6_check(from, cookie))
2203                 return &rt->dst;
2204         else
2205                 return NULL;
2206 }
2207
2208 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2209 {
2210         struct dst_entry *dst_ret;
2211         struct fib6_info *from;
2212         struct rt6_info *rt;
2213
2214         rt = container_of(dst, struct rt6_info, dst);
2215
2216         rcu_read_lock();
2217
2218         /* All IPV6 dsts are created with ->obsolete set to the value
2219          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2220          * into this function always.
2221          */
2222
2223         from = rcu_dereference(rt->from);
2224
2225         if (from && (rt->rt6i_flags & RTF_PCPU ||
2226             unlikely(!list_empty(&rt->rt6i_uncached))))
2227                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2228         else
2229                 dst_ret = rt6_check(rt, from, cookie);
2230
2231         rcu_read_unlock();
2232
2233         return dst_ret;
2234 }
2235
2236 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2237 {
2238         struct rt6_info *rt = (struct rt6_info *) dst;
2239
2240         if (rt) {
2241                 if (rt->rt6i_flags & RTF_CACHE) {
2242                         rcu_read_lock();
2243                         if (rt6_check_expired(rt)) {
2244                                 rt6_remove_exception_rt(rt);
2245                                 dst = NULL;
2246                         }
2247                         rcu_read_unlock();
2248                 } else {
2249                         dst_release(dst);
2250                         dst = NULL;
2251                 }
2252         }
2253         return dst;
2254 }
2255
2256 static void ip6_link_failure(struct sk_buff *skb)
2257 {
2258         struct rt6_info *rt;
2259
2260         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2261
2262         rt = (struct rt6_info *) skb_dst(skb);
2263         if (rt) {
2264                 rcu_read_lock();
2265                 if (rt->rt6i_flags & RTF_CACHE) {
2266                         if (dst_hold_safe(&rt->dst))
2267                                 rt6_remove_exception_rt(rt);
2268                 } else {
2269                         struct fib6_info *from;
2270                         struct fib6_node *fn;
2271
2272                         from = rcu_dereference(rt->from);
2273                         if (from) {
2274                                 fn = rcu_dereference(from->fib6_node);
2275                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2276                                         fn->fn_sernum = -1;
2277                         }
2278                 }
2279                 rcu_read_unlock();
2280         }
2281 }
2282
2283 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2284 {
2285         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2286                 struct fib6_info *from;
2287
2288                 rcu_read_lock();
2289                 from = rcu_dereference(rt0->from);
2290                 if (from)
2291                         rt0->dst.expires = from->expires;
2292                 rcu_read_unlock();
2293         }
2294
2295         dst_set_expires(&rt0->dst, timeout);
2296         rt0->rt6i_flags |= RTF_EXPIRES;
2297 }
2298
2299 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2300 {
2301         struct net *net = dev_net(rt->dst.dev);
2302
2303         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2304         rt->rt6i_flags |= RTF_MODIFIED;
2305         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2306 }
2307
2308 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2309 {
2310         bool from_set;
2311
2312         rcu_read_lock();
2313         from_set = !!rcu_dereference(rt->from);
2314         rcu_read_unlock();
2315
2316         return !(rt->rt6i_flags & RTF_CACHE) &&
2317                 (rt->rt6i_flags & RTF_PCPU || from_set);
2318 }
2319
2320 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2321                                  const struct ipv6hdr *iph, u32 mtu)
2322 {
2323         const struct in6_addr *daddr, *saddr;
2324         struct rt6_info *rt6 = (struct rt6_info *)dst;
2325
2326         if (dst_metric_locked(dst, RTAX_MTU))
2327                 return;
2328
2329         if (iph) {
2330                 daddr = &iph->daddr;
2331                 saddr = &iph->saddr;
2332         } else if (sk) {
2333                 daddr = &sk->sk_v6_daddr;
2334                 saddr = &inet6_sk(sk)->saddr;
2335         } else {
2336                 daddr = NULL;
2337                 saddr = NULL;
2338         }
2339         dst_confirm_neigh(dst, daddr);
2340         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2341         if (mtu >= dst_mtu(dst))
2342                 return;
2343
2344         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2345                 rt6_do_update_pmtu(rt6, mtu);
2346                 /* update rt6_ex->stamp for cache */
2347                 if (rt6->rt6i_flags & RTF_CACHE)
2348                         rt6_update_exception_stamp_rt(rt6);
2349         } else if (daddr) {
2350                 struct fib6_info *from;
2351                 struct rt6_info *nrt6;
2352
2353                 rcu_read_lock();
2354                 from = rcu_dereference(rt6->from);
2355                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2356                 if (nrt6) {
2357                         rt6_do_update_pmtu(nrt6, mtu);
2358                         if (rt6_insert_exception(nrt6, from))
2359                                 dst_release_immediate(&nrt6->dst);
2360                 }
2361                 rcu_read_unlock();
2362         }
2363 }
2364
2365 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2366                                struct sk_buff *skb, u32 mtu)
2367 {
2368         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2369 }
2370
2371 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2372                      int oif, u32 mark, kuid_t uid)
2373 {
2374         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2375         struct dst_entry *dst;
2376         struct flowi6 fl6;
2377
2378         memset(&fl6, 0, sizeof(fl6));
2379         fl6.flowi6_oif = oif;
2380         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2381         fl6.daddr = iph->daddr;
2382         fl6.saddr = iph->saddr;
2383         fl6.flowlabel = ip6_flowinfo(iph);
2384         fl6.flowi6_uid = uid;
2385
2386         dst = ip6_route_output(net, NULL, &fl6);
2387         if (!dst->error)
2388                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2389         dst_release(dst);
2390 }
2391 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2392
2393 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2394 {
2395         struct dst_entry *dst;
2396
2397         ip6_update_pmtu(skb, sock_net(sk), mtu,
2398                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2399
2400         dst = __sk_dst_get(sk);
2401         if (!dst || !dst->obsolete ||
2402             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2403                 return;
2404
2405         bh_lock_sock(sk);
2406         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2407                 ip6_datagram_dst_update(sk, false);
2408         bh_unlock_sock(sk);
2409 }
2410 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2411
2412 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2413                            const struct flowi6 *fl6)
2414 {
2415 #ifdef CONFIG_IPV6_SUBTREES
2416         struct ipv6_pinfo *np = inet6_sk(sk);
2417 #endif
2418
2419         ip6_dst_store(sk, dst,
2420                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2421                       &sk->sk_v6_daddr : NULL,
2422 #ifdef CONFIG_IPV6_SUBTREES
2423                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2424                       &np->saddr :
2425 #endif
2426                       NULL);
2427 }
2428
2429 /* Handle redirects */
2430 struct ip6rd_flowi {
2431         struct flowi6 fl6;
2432         struct in6_addr gateway;
2433 };
2434
2435 static struct rt6_info *__ip6_route_redirect(struct net *net,
2436                                              struct fib6_table *table,
2437                                              struct flowi6 *fl6,
2438                                              const struct sk_buff *skb,
2439                                              int flags)
2440 {
2441         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2442         struct rt6_info *ret = NULL, *rt_cache;
2443         struct fib6_info *rt;
2444         struct fib6_node *fn;
2445
2446         /* Get the "current" route for this destination and
2447          * check if the redirect has come from appropriate router.
2448          *
2449          * RFC 4861 specifies that redirects should only be
2450          * accepted if they come from the nexthop to the target.
2451          * Due to the way the routes are chosen, this notion
2452          * is a bit fuzzy and one might need to check all possible
2453          * routes.
2454          */
2455
2456         rcu_read_lock();
2457         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2458 restart:
2459         for_each_fib6_node_rt_rcu(fn) {
2460                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2461                         continue;
2462                 if (fib6_check_expired(rt))
2463                         continue;
2464                 if (rt->fib6_flags & RTF_REJECT)
2465                         break;
2466                 if (!(rt->fib6_flags & RTF_GATEWAY))
2467                         continue;
2468                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2469                         continue;
2470                 /* rt_cache's gateway might be different from its 'parent'
2471                  * in the case of an ip redirect.
2472                  * So we keep searching in the exception table if the gateway
2473                  * is different.
2474                  */
2475                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2476                         rt_cache = rt6_find_cached_rt(rt,
2477                                                       &fl6->daddr,
2478                                                       &fl6->saddr);
2479                         if (rt_cache &&
2480                             ipv6_addr_equal(&rdfl->gateway,
2481                                             &rt_cache->rt6i_gateway)) {
2482                                 ret = rt_cache;
2483                                 break;
2484                         }
2485                         continue;
2486                 }
2487                 break;
2488         }
2489
2490         if (!rt)
2491                 rt = net->ipv6.fib6_null_entry;
2492         else if (rt->fib6_flags & RTF_REJECT) {
2493                 ret = net->ipv6.ip6_null_entry;
2494                 goto out;
2495         }
2496
2497         if (rt == net->ipv6.fib6_null_entry) {
2498                 fn = fib6_backtrack(fn, &fl6->saddr);
2499                 if (fn)
2500                         goto restart;
2501         }
2502
2503 out:
2504         if (ret)
2505                 ip6_hold_safe(net, &ret, true);
2506         else
2507                 ret = ip6_create_rt_rcu(rt);
2508
2509         rcu_read_unlock();
2510
2511         trace_fib6_table_lookup(net, rt, table, fl6);
2512         return ret;
2513 };
2514
2515 static struct dst_entry *ip6_route_redirect(struct net *net,
2516                                             const struct flowi6 *fl6,
2517                                             const struct sk_buff *skb,
2518                                             const struct in6_addr *gateway)
2519 {
2520         int flags = RT6_LOOKUP_F_HAS_SADDR;
2521         struct ip6rd_flowi rdfl;
2522
2523         rdfl.fl6 = *fl6;
2524         rdfl.gateway = *gateway;
2525
2526         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2527                                 flags, __ip6_route_redirect);
2528 }
2529
2530 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2531                   kuid_t uid)
2532 {
2533         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2534         struct dst_entry *dst;
2535         struct flowi6 fl6;
2536
2537         memset(&fl6, 0, sizeof(fl6));
2538         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2539         fl6.flowi6_oif = oif;
2540         fl6.flowi6_mark = mark;
2541         fl6.daddr = iph->daddr;
2542         fl6.saddr = iph->saddr;
2543         fl6.flowlabel = ip6_flowinfo(iph);
2544         fl6.flowi6_uid = uid;
2545
2546         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2547         rt6_do_redirect(dst, NULL, skb);
2548         dst_release(dst);
2549 }
2550 EXPORT_SYMBOL_GPL(ip6_redirect);
2551
2552 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2553                             u32 mark)
2554 {
2555         const struct ipv6hdr *iph = ipv6_hdr(skb);
2556         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2557         struct dst_entry *dst;
2558         struct flowi6 fl6;
2559
2560         memset(&fl6, 0, sizeof(fl6));
2561         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2562         fl6.flowi6_oif = oif;
2563         fl6.flowi6_mark = mark;
2564         fl6.daddr = msg->dest;
2565         fl6.saddr = iph->daddr;
2566         fl6.flowi6_uid = sock_net_uid(net, NULL);
2567
2568         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2569         rt6_do_redirect(dst, NULL, skb);
2570         dst_release(dst);
2571 }
2572
2573 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2574 {
2575         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2576                      sk->sk_uid);
2577 }
2578 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2579
2580 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2581 {
2582         struct net_device *dev = dst->dev;
2583         unsigned int mtu = dst_mtu(dst);
2584         struct net *net = dev_net(dev);
2585
2586         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2587
2588         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2589                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2590
2591         /*
2592          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2593          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2594          * IPV6_MAXPLEN is also valid and means: "any MSS,
2595          * rely only on pmtu discovery"
2596          */
2597         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2598                 mtu = IPV6_MAXPLEN;
2599         return mtu;
2600 }
2601
2602 static unsigned int ip6_mtu(const struct dst_entry *dst)
2603 {
2604         struct inet6_dev *idev;
2605         unsigned int mtu;
2606
2607         mtu = dst_metric_raw(dst, RTAX_MTU);
2608         if (mtu)
2609                 goto out;
2610
2611         mtu = IPV6_MIN_MTU;
2612
2613         rcu_read_lock();
2614         idev = __in6_dev_get(dst->dev);
2615         if (idev)
2616                 mtu = idev->cnf.mtu6;
2617         rcu_read_unlock();
2618
2619 out:
2620         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2621
2622         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2623 }
2624
2625 /* MTU selection:
2626  * 1. mtu on route is locked - use it
2627  * 2. mtu from nexthop exception
2628  * 3. mtu from egress device
2629  *
2630  * based on ip6_dst_mtu_forward and exception logic of
2631  * rt6_find_cached_rt; called with rcu_read_lock
2632  */
2633 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2634                       struct in6_addr *saddr)
2635 {
2636         struct rt6_exception_bucket *bucket;
2637         struct rt6_exception *rt6_ex;
2638         struct in6_addr *src_key;
2639         struct inet6_dev *idev;
2640         u32 mtu = 0;
2641
2642         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2643                 mtu = f6i->fib6_pmtu;
2644                 if (mtu)
2645                         goto out;
2646         }
2647
2648         src_key = NULL;
2649 #ifdef CONFIG_IPV6_SUBTREES
2650         if (f6i->fib6_src.plen)
2651                 src_key = saddr;
2652 #endif
2653
2654         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2655         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2656         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2657                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2658
2659         if (likely(!mtu)) {
2660                 struct net_device *dev = fib6_info_nh_dev(f6i);
2661
2662                 mtu = IPV6_MIN_MTU;
2663                 idev = __in6_dev_get(dev);
2664                 if (idev && idev->cnf.mtu6 > mtu)
2665                         mtu = idev->cnf.mtu6;
2666         }
2667
2668         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2669 out:
2670         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2671 }
2672
2673 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2674                                   struct flowi6 *fl6)
2675 {
2676         struct dst_entry *dst;
2677         struct rt6_info *rt;
2678         struct inet6_dev *idev = in6_dev_get(dev);
2679         struct net *net = dev_net(dev);
2680
2681         if (unlikely(!idev))
2682                 return ERR_PTR(-ENODEV);
2683
2684         rt = ip6_dst_alloc(net, dev, 0);
2685         if (unlikely(!rt)) {
2686                 in6_dev_put(idev);
2687                 dst = ERR_PTR(-ENOMEM);
2688                 goto out;
2689         }
2690
2691         rt->dst.flags |= DST_HOST;
2692         rt->dst.input = ip6_input;
2693         rt->dst.output  = ip6_output;
2694         rt->rt6i_gateway  = fl6->daddr;
2695         rt->rt6i_dst.addr = fl6->daddr;
2696         rt->rt6i_dst.plen = 128;
2697         rt->rt6i_idev     = idev;
2698         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2699
2700         /* Add this dst into uncached_list so that rt6_disable_ip() can
2701          * do proper release of the net_device
2702          */
2703         rt6_uncached_list_add(rt);
2704         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2705
2706         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2707
2708 out:
2709         return dst;
2710 }
2711
2712 static int ip6_dst_gc(struct dst_ops *ops)
2713 {
2714         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2715         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2716         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2717         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2718         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2719         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2720         int entries;
2721
2722         entries = dst_entries_get_fast(ops);
2723         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2724             entries <= rt_max_size)
2725                 goto out;
2726
2727         net->ipv6.ip6_rt_gc_expire++;
2728         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2729         entries = dst_entries_get_slow(ops);
2730         if (entries < ops->gc_thresh)
2731                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2732 out:
2733         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2734         return entries > rt_max_size;
2735 }
2736
2737 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2738                                struct fib6_config *cfg)
2739 {
2740         struct dst_metrics *p;
2741
2742         if (!cfg->fc_mx)
2743                 return 0;
2744
2745         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2746         if (unlikely(!p))
2747                 return -ENOMEM;
2748
2749         refcount_set(&p->refcnt, 1);
2750         rt->fib6_metrics = p;
2751
2752         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2753 }
2754
2755 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2756                                             struct fib6_config *cfg,
2757                                             const struct in6_addr *gw_addr,
2758                                             u32 tbid, int flags)
2759 {
2760         struct flowi6 fl6 = {
2761                 .flowi6_oif = cfg->fc_ifindex,
2762                 .daddr = *gw_addr,
2763                 .saddr = cfg->fc_prefsrc,
2764         };
2765         struct fib6_table *table;
2766         struct rt6_info *rt;
2767
2768         table = fib6_get_table(net, tbid);
2769         if (!table)
2770                 return NULL;
2771
2772         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2773                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2774
2775         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2776         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2777
2778         /* if table lookup failed, fall back to full lookup */
2779         if (rt == net->ipv6.ip6_null_entry) {
2780                 ip6_rt_put(rt);
2781                 rt = NULL;
2782         }
2783
2784         return rt;
2785 }
2786
2787 static int ip6_route_check_nh_onlink(struct net *net,
2788                                      struct fib6_config *cfg,
2789                                      const struct net_device *dev,
2790                                      struct netlink_ext_ack *extack)
2791 {
2792         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2793         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2794         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2795         struct rt6_info *grt;
2796         int err;
2797
2798         err = 0;
2799         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2800         if (grt) {
2801                 if (!grt->dst.error &&
2802                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2803                         NL_SET_ERR_MSG(extack,
2804                                        "Nexthop has invalid gateway or device mismatch");
2805                         err = -EINVAL;
2806                 }
2807
2808                 ip6_rt_put(grt);
2809         }
2810
2811         return err;
2812 }
2813
2814 static int ip6_route_check_nh(struct net *net,
2815                               struct fib6_config *cfg,
2816                               struct net_device **_dev,
2817                               struct inet6_dev **idev)
2818 {
2819         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2820         struct net_device *dev = _dev ? *_dev : NULL;
2821         struct rt6_info *grt = NULL;
2822         int err = -EHOSTUNREACH;
2823
2824         if (cfg->fc_table) {
2825                 int flags = RT6_LOOKUP_F_IFACE;
2826
2827                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2828                                           cfg->fc_table, flags);
2829                 if (grt) {
2830                         if (grt->rt6i_flags & RTF_GATEWAY ||
2831                             (dev && dev != grt->dst.dev)) {
2832                                 ip6_rt_put(grt);
2833                                 grt = NULL;
2834                         }
2835                 }
2836         }
2837
2838         if (!grt)
2839                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2840
2841         if (!grt)
2842                 goto out;
2843
2844         if (dev) {
2845                 if (dev != grt->dst.dev) {
2846                         ip6_rt_put(grt);
2847                         goto out;
2848                 }
2849         } else {
2850                 *_dev = dev = grt->dst.dev;
2851                 *idev = grt->rt6i_idev;
2852                 dev_hold(dev);
2853                 in6_dev_hold(grt->rt6i_idev);
2854         }
2855
2856         if (!(grt->rt6i_flags & RTF_GATEWAY))
2857                 err = 0;
2858
2859         ip6_rt_put(grt);
2860
2861 out:
2862         return err;
2863 }
2864
2865 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2866                            struct net_device **_dev, struct inet6_dev **idev,
2867                            struct netlink_ext_ack *extack)
2868 {
2869         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2870         int gwa_type = ipv6_addr_type(gw_addr);
2871         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2872         const struct net_device *dev = *_dev;
2873         bool need_addr_check = !dev;
2874         int err = -EINVAL;
2875
2876         /* if gw_addr is local we will fail to detect this in case
2877          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2878          * will return already-added prefix route via interface that
2879          * prefix route was assigned to, which might be non-loopback.
2880          */
2881         if (dev &&
2882             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2883                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2884                 goto out;
2885         }
2886
2887         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2888                 /* IPv6 strictly inhibits using not link-local
2889                  * addresses as nexthop address.
2890                  * Otherwise, router will not able to send redirects.
2891                  * It is very good, but in some (rare!) circumstances
2892                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2893                  * some exceptions. --ANK
2894                  * We allow IPv4-mapped nexthops to support RFC4798-type
2895                  * addressing
2896                  */
2897                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2898                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2899                         goto out;
2900                 }
2901
2902                 if (cfg->fc_flags & RTNH_F_ONLINK)
2903                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2904                 else
2905                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2906
2907                 if (err)
2908                         goto out;
2909         }
2910
2911         /* reload in case device was changed */
2912         dev = *_dev;
2913
2914         err = -EINVAL;
2915         if (!dev) {
2916                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2917                 goto out;
2918         } else if (dev->flags & IFF_LOOPBACK) {
2919                 NL_SET_ERR_MSG(extack,
2920                                "Egress device can not be loopback device for this route");
2921                 goto out;
2922         }
2923
2924         /* if we did not check gw_addr above, do so now that the
2925          * egress device has been resolved.
2926          */
2927         if (need_addr_check &&
2928             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2929                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2930                 goto out;
2931         }
2932
2933         err = 0;
2934 out:
2935         return err;
2936 }
2937
/*
 * Build a fib6_info from a route configuration without inserting it into
 * the FIB.
 *
 * Validates @cfg, resolves the egress device and its inet6_dev (from
 * cfg->fc_ifindex or, for gateway routes, through ip6_validate_gw()),
 * obtains the target FIB table and fills in a newly allocated entry.
 *
 * On success the device reference taken here is kept and stored in
 * fib6_nh.nh_dev, while the inet6_dev reference is dropped before
 * returning.  On failure the references taken so far are released and an
 * ERR_PTR() is returned; some failures also set a message in @extack.
 *
 * Side effects on @cfg: a zero fc_metric becomes IP6_RT_PRIO_USER,
 * RTPROT_UNSPEC becomes RTPROT_BOOT, and fc_nlinfo.nl_net is overwritten
 * with the namespace of the chosen device.
 */
2938 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2939                                               gfp_t gfp_flags,
2940                                               struct netlink_ext_ack *extack)
2941 {
2942         struct net *net = cfg->fc_nlinfo.nl_net;
2943         struct fib6_info *rt = NULL;
2944         struct net_device *dev = NULL;
2945         struct inet6_dev *idev = NULL;
2946         struct fib6_table *table;
2947         int addr_type;
2948         int err = -EINVAL;
2949
2950         /* RTF_PCPU is an internal flag; can not be set by userspace */
2951         if (cfg->fc_flags & RTF_PCPU) {
2952                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2953                 goto out;
2954         }
2955
2956         /* RTF_CACHE is an internal flag; can not be set by userspace */
2957         if (cfg->fc_flags & RTF_CACHE) {
2958                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2959                 goto out;
2960         }
2961
2962         if (cfg->fc_type > RTN_MAX) {
2963                 NL_SET_ERR_MSG(extack, "Invalid route type");
2964                 goto out;
2965         }
2966
2967         if (cfg->fc_dst_len > 128) {
2968                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2969                 goto out;
2970         }
2971         if (cfg->fc_src_len > 128) {
2972                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2973                 goto out;
2974         }
2975 #ifndef CONFIG_IPV6_SUBTREES
2976         if (cfg->fc_src_len) {
2977                 NL_SET_ERR_MSG(extack,
2978                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2979                 goto out;
2980         }
2981 #endif
             /* An explicit ifindex pins both the device and its inet6_dev;
              * each lookup takes a reference that is dropped at "out" on
              * failure.
              */
2982         if (cfg->fc_ifindex) {
2983                 err = -ENODEV;
2984                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2985                 if (!dev)
2986                         goto out;
2987                 idev = in6_dev_get(dev);
2988                 if (!idev)
2989                         goto out;
2990         }
2991
2992         if (cfg->fc_metric == 0)
2993                 cfg->fc_metric = IP6_RT_PRIO_USER;
2994
2995         if (cfg->fc_flags & RTNH_F_ONLINK) {
2996                 if (!dev) {
2997                         NL_SET_ERR_MSG(extack,
2998                                        "Nexthop device required for onlink");
2999                         err = -ENODEV;
3000                         goto out;
3001                 }
3002
3003                 if (!(dev->flags & IFF_UP)) {
3004                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3005                         err = -ENETDOWN;
3006                         goto out;
3007                 }
3008         }
3009
             /* err stays -ENOBUFS if no table can be obtained below.
              * Without NLM_F_CREATE an existing table is preferred; a
              * missing one is still created, after a warning.
              */
3010         err = -ENOBUFS;
3011         if (cfg->fc_nlinfo.nlh &&
3012             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3013                 table = fib6_get_table(net, cfg->fc_table);
3014                 if (!table) {
3015                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3016                         table = fib6_new_table(net, cfg->fc_table);
3017                 }
3018         } else {
3019                 table = fib6_new_table(net, cfg->fc_table);
3020         }
3021
3022         if (!table)
3023                 goto out;
3024
3025         err = -ENOMEM;
3026         rt = fib6_info_alloc(gfp_flags);
3027         if (!rt)
3028                 goto out;
3029
3030         if (cfg->fc_flags & RTF_ADDRCONF)
3031                 rt->dst_nocount = true;
3032
3033         err = ip6_convert_metrics(net, rt, cfg);
3034         if (err < 0)
3035                 goto out;
3036
3037         if (cfg->fc_flags & RTF_EXPIRES)
3038                 fib6_set_expires(rt, jiffies +
3039                                 clock_t_to_jiffies(cfg->fc_expires));
3040         else
3041                 fib6_clean_expires(rt);
3042
3043         if (cfg->fc_protocol == RTPROT_UNSPEC)
3044                 cfg->fc_protocol = RTPROT_BOOT;
3045         rt->fib6_protocol = cfg->fc_protocol;
3046
3047         addr_type = ipv6_addr_type(&cfg->fc_dst);
3048
3049         if (cfg->fc_encap) {
3050                 struct lwtunnel_state *lwtstate;
3051
3052                 err = lwtunnel_build_state(cfg->fc_encap_type,
3053                                            cfg->fc_encap, AF_INET6, cfg,
3054                                            &lwtstate, extack);
3055                 if (err)
3056                         goto out;
3057                 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3058         }
3059
3060         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3061         rt->fib6_dst.plen = cfg->fc_dst_len;
3062         if (rt->fib6_dst.plen == 128)
3063                 rt->dst_host = true;
3064
3065 #ifdef CONFIG_IPV6_SUBTREES
3066         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3067         rt->fib6_src.plen = cfg->fc_src_len;
3068 #endif
3069
3070         rt->fib6_metric = cfg->fc_metric;
3071         rt->fib6_nh.nh_weight = 1;
3072
3073         rt->fib6_type = cfg->fc_type;
3074
3075         /* We cannot add true routes via loopback here,
3076            they would result in kernel looping; promote them to reject routes
3077          */
3078         if ((cfg->fc_flags & RTF_REJECT) ||
3079             (dev && (dev->flags & IFF_LOOPBACK) &&
3080              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3081              !(cfg->fc_flags & RTF_LOCAL))) {
3082                 /* hold loopback dev/idev if we haven't done so. */
3083                 if (dev != net->loopback_dev) {
3084                         if (dev) {
3085                                 dev_put(dev);
3086                                 in6_dev_put(idev);
3087                         }
3088                         dev = net->loopback_dev;
3089                         dev_hold(dev);
3090                         idev = in6_dev_get(dev);
3091                         if (!idev) {
3092                                 err = -ENODEV;
3093                                 goto out;
3094                         }
3095                 }
                     /* reject routes skip the gateway, device-state and
                      * prefsrc checks below
                      */
3096                 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3097                 goto install_route;
3098         }
3099
3100         if (cfg->fc_flags & RTF_GATEWAY) {
                     /* may fill in dev/idev when none was given */
3101                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3102                 if (err)
3103                         goto out;
3104
3105                 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3106         }
3107
3108         err = -ENODEV;
3109         if (!dev)
3110                 goto out;
3111
3112         if (idev->cnf.disable_ipv6) {
3113                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3114                 err = -EACCES;
3115                 goto out;
3116         }
3117
3118         if (!(dev->flags & IFF_UP)) {
3119                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3120                 err = -ENETDOWN;
3121                 goto out;
3122         }
3123
3124         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3125                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3126                         NL_SET_ERR_MSG(extack, "Invalid source address");
3127                         err = -EINVAL;
3128                         goto out;
3129                 }
3130                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3131                 rt->fib6_prefsrc.plen = 128;
3132         } else
3133                 rt->fib6_prefsrc.plen = 0;
3134
3135         rt->fib6_flags = cfg->fc_flags;
3136
3137 install_route:
3138         if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3139             !netif_carrier_ok(dev))
3140                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3141         rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
             /* the device reference is handed over to the new entry */
3142         rt->fib6_nh.nh_dev = dev;
3143         rt->fib6_table = table;
3144
3145         cfg->fc_nlinfo.nl_net = dev_net(dev);
3146
3147         if (idev)
3148                 in6_dev_put(idev);
3149
3150         return rt;
3151 out:
3152         if (dev)
3153                 dev_put(dev);
3154         if (idev)
3155                 in6_dev_put(idev);
3156
             /* NOTE(review): rt is still NULL on the early failures;
              * presumably fib6_info_release() tolerates NULL - confirm.
              */
3157         fib6_info_release(rt);
3158         return ERR_PTR(err);
3159 }
3160
3161 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3162                   struct netlink_ext_ack *extack)
3163 {
3164         struct fib6_info *rt;
3165         int err;
3166
3167         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3168         if (IS_ERR(rt))
3169                 return PTR_ERR(rt);
3170
3171         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3172         fib6_info_release(rt);
3173
3174         return err;
3175 }
3176
3177 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3178 {
3179         struct net *net = info->nl_net;
3180         struct fib6_table *table;
3181         int err;
3182
3183         if (rt == net->ipv6.fib6_null_entry) {
3184                 err = -ENOENT;
3185                 goto out;
3186         }
3187
3188         table = rt->fib6_table;
3189         spin_lock_bh(&table->tb6_lock);
3190         err = fib6_del(rt, info);
3191         spin_unlock_bh(&table->tb6_lock);
3192
3193 out:
3194         fib6_info_release(rt);
3195         return err;
3196 }
3197
3198 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3199 {
3200         struct nl_info info = { .nl_net = net };
3201
3202         return __ip6_del_rt(rt, &info);
3203 }
3204
3205 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3206 {
3207         struct nl_info *info = &cfg->fc_nlinfo;
3208         struct net *net = info->nl_net;
3209         struct sk_buff *skb = NULL;
3210         struct fib6_table *table;
3211         int err = -ENOENT;
3212
3213         if (rt == net->ipv6.fib6_null_entry)
3214                 goto out_put;
3215         table = rt->fib6_table;
3216         spin_lock_bh(&table->tb6_lock);
3217
3218         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3219                 struct fib6_info *sibling, *next_sibling;
3220
3221                 /* prefer to send a single notification with all hops */
3222                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3223                 if (skb) {
3224                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3225
3226                         if (rt6_fill_node(net, skb, rt, NULL,
3227                                           NULL, NULL, 0, RTM_DELROUTE,
3228                                           info->portid, seq, 0) < 0) {
3229                                 kfree_skb(skb);
3230                                 skb = NULL;
3231                         } else
3232                                 info->skip_notify = 1;
3233                 }
3234
3235                 list_for_each_entry_safe(sibling, next_sibling,
3236                                          &rt->fib6_siblings,
3237                                          fib6_siblings) {
3238                         err = fib6_del(sibling, info);
3239                         if (err)
3240                                 goto out_unlock;
3241                 }
3242         }
3243
3244         err = fib6_del(rt, info);
3245 out_unlock:
3246         spin_unlock_bh(&table->tb6_lock);
3247 out_put:
3248         fib6_info_release(rt);
3249
3250         if (skb) {
3251                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3252                             info->nlh, gfp_any());
3253         }
3254         return err;
3255 }
3256
3257 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3258 {
3259         int rc = -ESRCH;
3260
3261         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3262                 goto out;
3263
3264         if (cfg->fc_flags & RTF_GATEWAY &&
3265             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3266                 goto out;
3267         if (dst_hold_safe(&rt->dst))
3268                 rc = rt6_remove_exception_rt(rt);
3269 out:
3270         return rc;
3271 }
3272
3273 static int ip6_route_del(struct fib6_config *cfg,
3274                          struct netlink_ext_ack *extack)
3275 {
3276         struct rt6_info *rt_cache;
3277         struct fib6_table *table;
3278         struct fib6_info *rt;
3279         struct fib6_node *fn;
3280         int err = -ESRCH;
3281
3282         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3283         if (!table) {
3284                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3285                 return err;
3286         }
3287
3288         rcu_read_lock();
3289
3290         fn = fib6_locate(&table->tb6_root,
3291                          &cfg->fc_dst, cfg->fc_dst_len,
3292                          &cfg->fc_src, cfg->fc_src_len,
3293                          !(cfg->fc_flags & RTF_CACHE));
3294
3295         if (fn) {
3296                 for_each_fib6_node_rt_rcu(fn) {
3297                         if (cfg->fc_flags & RTF_CACHE) {
3298                                 int rc;
3299
3300                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3301                                                               &cfg->fc_src);
3302                                 if (rt_cache) {
3303                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3304                                         if (rc != -ESRCH) {
3305                                                 rcu_read_unlock();
3306                                                 return rc;
3307                                         }
3308                                 }
3309                                 continue;
3310                         }
3311                         if (cfg->fc_ifindex &&
3312                             (!rt->fib6_nh.nh_dev ||
3313                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3314                                 continue;
3315                         if (cfg->fc_flags & RTF_GATEWAY &&
3316                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3317                                 continue;
3318                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3319                                 continue;
3320                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3321                                 continue;
3322                         if (!fib6_info_hold_safe(rt))
3323                                 continue;
3324                         rcu_read_unlock();
3325
3326                         /* if gateway was specified only delete the one hop */
3327                         if (cfg->fc_flags & RTF_GATEWAY)
3328                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3329
3330                         return __ip6_del_rt_siblings(rt, cfg);
3331                 }
3332         }
3333         rcu_read_unlock();
3334
3335         return err;
3336 }
3337
3338 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3339 {
3340         struct netevent_redirect netevent;
3341         struct rt6_info *rt, *nrt = NULL;
3342         struct ndisc_options ndopts;
3343         struct inet6_dev *in6_dev;
3344         struct neighbour *neigh;
3345         struct fib6_info *from;
3346         struct rd_msg *msg;
3347         int optlen, on_link;
3348         u8 *lladdr;
3349
3350         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3351         optlen -= sizeof(*msg);
3352
3353         if (optlen < 0) {
3354                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3355                 return;
3356         }
3357
3358         msg = (struct rd_msg *)icmp6_hdr(skb);
3359
3360         if (ipv6_addr_is_multicast(&msg->dest)) {
3361                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3362                 return;
3363         }
3364
3365         on_link = 0;
3366         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3367                 on_link = 1;
3368         } else if (ipv6_addr_type(&msg->target) !=
3369                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3370                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3371                 return;
3372         }
3373
3374         in6_dev = __in6_dev_get(skb->dev);
3375         if (!in6_dev)
3376                 return;
3377         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3378                 return;
3379
3380         /* RFC2461 8.1:
3381          *      The IP source address of the Redirect MUST be the same as the current
3382          *      first-hop router for the specified ICMP Destination Address.
3383          */
3384
3385         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3386                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3387                 return;
3388         }
3389
3390         lladdr = NULL;
3391         if (ndopts.nd_opts_tgt_lladdr) {
3392                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3393                                              skb->dev);
3394                 if (!lladdr) {
3395                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3396                         return;
3397                 }
3398         }
3399
3400         rt = (struct rt6_info *) dst;
3401         if (rt->rt6i_flags & RTF_REJECT) {
3402                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3403                 return;
3404         }
3405
3406         /* Redirect received -> path was valid.
3407          * Look, redirects are sent only in response to data packets,
3408          * so that this nexthop apparently is reachable. --ANK
3409          */
3410         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3411
3412         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3413         if (!neigh)
3414                 return;
3415
3416         /*
3417          *      We have finally decided to accept it.
3418          */
3419
3420         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3421                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3422                      NEIGH_UPDATE_F_OVERRIDE|
3423                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3424                                      NEIGH_UPDATE_F_ISROUTER)),
3425                      NDISC_REDIRECT, &ndopts);
3426
3427         rcu_read_lock();
3428         from = rcu_dereference(rt->from);
3429         /* This fib6_info_hold() is safe here because we hold reference to rt
3430          * and rt already holds reference to fib6_info.
3431          */
3432         fib6_info_hold(from);
3433         rcu_read_unlock();
3434
3435         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3436         if (!nrt)
3437                 goto out;
3438
3439         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3440         if (on_link)
3441                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3442
3443         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3444
3445         /* No need to remove rt from the exception table if rt is
3446          * a cached route because rt6_insert_exception() will
3447          * takes care of it
3448          */
3449         if (rt6_insert_exception(nrt, from)) {
3450                 dst_release_immediate(&nrt->dst);
3451                 goto out;
3452         }
3453
3454         netevent.old = &rt->dst;
3455         netevent.new = &nrt->dst;
3456         netevent.daddr = &msg->dest;
3457         netevent.neigh = neigh;
3458         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3459
3460 out:
3461         fib6_info_release(from);
3462         neigh_release(neigh);
3463 }
3464
3465 #ifdef CONFIG_IPV6_ROUTE_INFO
3466 static struct fib6_info *rt6_get_route_info(struct net *net,
3467                                            const struct in6_addr *prefix, int prefixlen,
3468                                            const struct in6_addr *gwaddr,
3469                                            struct net_device *dev)
3470 {
3471         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3472         int ifindex = dev->ifindex;
3473         struct fib6_node *fn;
3474         struct fib6_info *rt = NULL;
3475         struct fib6_table *table;
3476
3477         table = fib6_get_table(net, tb_id);
3478         if (!table)
3479                 return NULL;
3480
3481         rcu_read_lock();
3482         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3483         if (!fn)
3484                 goto out;
3485
3486         for_each_fib6_node_rt_rcu(fn) {
3487                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3488                         continue;
3489                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3490                         continue;
3491                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3492                         continue;
3493                 if (!fib6_info_hold_safe(rt))
3494                         continue;
3495                 break;
3496         }
3497 out:
3498         rcu_read_unlock();
3499         return rt;
3500 }
3501
3502 static struct fib6_info *rt6_add_route_info(struct net *net,
3503                                            const struct in6_addr *prefix, int prefixlen,
3504                                            const struct in6_addr *gwaddr,
3505                                            struct net_device *dev,
3506                                            unsigned int pref)
3507 {
3508         struct fib6_config cfg = {
3509                 .fc_metric      = IP6_RT_PRIO_USER,
3510                 .fc_ifindex     = dev->ifindex,
3511                 .fc_dst_len     = prefixlen,
3512                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3513                                   RTF_UP | RTF_PREF(pref),
3514                 .fc_protocol = RTPROT_RA,
3515                 .fc_type = RTN_UNICAST,
3516                 .fc_nlinfo.portid = 0,
3517                 .fc_nlinfo.nlh = NULL,
3518                 .fc_nlinfo.nl_net = net,
3519         };
3520
3521         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3522         cfg.fc_dst = *prefix;
3523         cfg.fc_gateway = *gwaddr;
3524
3525         /* We should treat it as a default route if prefix length is 0. */
3526         if (!prefixlen)
3527                 cfg.fc_flags |= RTF_DEFAULT;
3528
3529         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3530
3531         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3532 }
3533 #endif
3534
3535 struct fib6_info *rt6_get_dflt_router(struct net *net,
3536                                      const struct in6_addr *addr,
3537                                      struct net_device *dev)
3538 {
3539         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3540         struct fib6_info *rt;
3541         struct fib6_table *table;
3542
3543         table = fib6_get_table(net, tb_id);
3544         if (!table)
3545                 return NULL;
3546
3547         rcu_read_lock();
3548         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3549                 if (dev == rt->fib6_nh.nh_dev &&
3550                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3551                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3552                         break;
3553         }
3554         if (rt && !fib6_info_hold_safe(rt))
3555                 rt = NULL;
3556         rcu_read_unlock();
3557         return rt;
3558 }
3559
3560 struct fib6_info *rt6_add_dflt_router(struct net *net,
3561                                      const struct in6_addr *gwaddr,
3562                                      struct net_device *dev,
3563                                      unsigned int pref)
3564 {
3565         struct fib6_config cfg = {
3566                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3567                 .fc_metric      = IP6_RT_PRIO_USER,
3568                 .fc_ifindex     = dev->ifindex,
3569                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3570                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3571                 .fc_protocol = RTPROT_RA,
3572                 .fc_type = RTN_UNICAST,
3573                 .fc_nlinfo.portid = 0,
3574                 .fc_nlinfo.nlh = NULL,
3575                 .fc_nlinfo.nl_net = net,
3576         };
3577
3578         cfg.fc_gateway = *gwaddr;
3579
3580         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3581                 struct fib6_table *table;
3582
3583                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3584                 if (table)
3585                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3586         }
3587
3588         return rt6_get_dflt_router(net, gwaddr, dev);
3589 }
3590
3591 static void __rt6_purge_dflt_routers(struct net *net,
3592                                      struct fib6_table *table)
3593 {
3594         struct fib6_info *rt;
3595
3596 restart:
3597         rcu_read_lock();
3598         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3599                 struct net_device *dev = fib6_info_nh_dev(rt);
3600                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3601
3602                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3603                     (!idev || idev->cnf.accept_ra != 2) &&
3604                     fib6_info_hold_safe(rt)) {
3605                         rcu_read_unlock();
3606                         ip6_del_rt(net, rt);
3607                         goto restart;
3608                 }
3609         }
3610         rcu_read_unlock();
3611
3612         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3613 }
3614
3615 void rt6_purge_dflt_routers(struct net *net)
3616 {
3617         struct fib6_table *table;
3618         struct hlist_head *head;
3619         unsigned int h;
3620
3621         rcu_read_lock();
3622
3623         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3624                 head = &net->ipv6.fib_table_hash[h];
3625                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3626                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3627                                 __rt6_purge_dflt_routers(net, table);
3628                 }
3629         }
3630
3631         rcu_read_unlock();
3632 }
3633
3634 static void rtmsg_to_fib6_config(struct net *net,
3635                                  struct in6_rtmsg *rtmsg,
3636                                  struct fib6_config *cfg)
3637 {
3638         memset(cfg, 0, sizeof(*cfg));
3639
3640         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3641                          : RT6_TABLE_MAIN;
3642         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3643         cfg->fc_metric = rtmsg->rtmsg_metric;
3644         cfg->fc_expires = rtmsg->rtmsg_info;
3645         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3646         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3647         cfg->fc_flags = rtmsg->rtmsg_flags;
3648         cfg->fc_type = rtmsg->rtmsg_type;
3649
3650         cfg->fc_nlinfo.nl_net = net;
3651
3652         cfg->fc_dst = rtmsg->rtmsg_dst;
3653         cfg->fc_src = rtmsg->rtmsg_src;
3654         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3655 }
3656
3657 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3658 {
3659         struct fib6_config cfg;
3660         struct in6_rtmsg rtmsg;
3661         int err;
3662
3663         switch (cmd) {
3664         case SIOCADDRT:         /* Add a route */
3665         case SIOCDELRT:         /* Delete a route */
3666                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3667                         return -EPERM;
3668                 err = copy_from_user(&rtmsg, arg,
3669                                      sizeof(struct in6_rtmsg));
3670                 if (err)
3671                         return -EFAULT;
3672
3673                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3674
3675                 rtnl_lock();
3676                 switch (cmd) {
3677                 case SIOCADDRT:
3678                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3679                         break;
3680                 case SIOCDELRT:
3681                         err = ip6_route_del(&cfg, NULL);
3682                         break;
3683                 default:
3684                         err = -EINVAL;
3685                 }
3686                 rtnl_unlock();
3687
3688                 return err;
3689         }
3690
3691         return -EINVAL;
3692 }
3693
3694 /*
3695  *      Drop the packet on the floor
3696  */
3697
3698 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3699 {
3700         int type;
3701         struct dst_entry *dst = skb_dst(skb);
3702         switch (ipstats_mib_noroutes) {
3703         case IPSTATS_MIB_INNOROUTES:
3704                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3705                 if (type == IPV6_ADDR_ANY) {
3706                         IP6_INC_STATS(dev_net(dst->dev),
3707                                       __in6_dev_get_safely(skb->dev),
3708                                       IPSTATS_MIB_INADDRERRORS);
3709                         break;
3710                 }
3711                 /* FALLTHROUGH */
3712         case IPSTATS_MIB_OUTNOROUTES:
3713                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3714                               ipstats_mib_noroutes);
3715                 break;
3716         }
3717         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3718         kfree_skb(skb);
3719         return 0;
3720 }
3721
3722 static int ip6_pkt_discard(struct sk_buff *skb)
3723 {
3724         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3725 }
3726
3727 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3728 {
3729         skb->dev = skb_dst(skb)->dev;
3730         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3731 }
3732
3733 static int ip6_pkt_prohibit(struct sk_buff *skb)
3734 {
3735         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3736 }
3737
3738 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3739 {
3740         skb->dev = skb_dst(skb)->dev;
3741         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3742 }
3743
3744 /*
3745  *      Allocate a dst for local (unicast / anycast) address.
3746  */
3747
3748 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3749                                      struct inet6_dev *idev,
3750                                      const struct in6_addr *addr,
3751                                      bool anycast, gfp_t gfp_flags)
3752 {
3753         u32 tb_id;
3754         struct net_device *dev = idev->dev;
3755         struct fib6_info *f6i;
3756
3757         f6i = fib6_info_alloc(gfp_flags);
3758         if (!f6i)
3759                 return ERR_PTR(-ENOMEM);
3760
3761         f6i->dst_nocount = true;
3762         f6i->dst_host = true;
3763         f6i->fib6_protocol = RTPROT_KERNEL;
3764         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3765         if (anycast) {
3766                 f6i->fib6_type = RTN_ANYCAST;
3767                 f6i->fib6_flags |= RTF_ANYCAST;
3768         } else {
3769                 f6i->fib6_type = RTN_LOCAL;
3770                 f6i->fib6_flags |= RTF_LOCAL;
3771         }
3772
3773         f6i->fib6_nh.nh_gw = *addr;
3774         dev_hold(dev);
3775         f6i->fib6_nh.nh_dev = dev;
3776         f6i->fib6_dst.addr = *addr;
3777         f6i->fib6_dst.plen = 128;
3778         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3779         f6i->fib6_table = fib6_get_table(net, tb_id);
3780
3781         return f6i;
3782 }
3783
/* Remove a deleted IP address from the prefsrc entries of routes.
 *
 * Argument bundle handed to fib6_remove_prefsrc() through
 * fib6_clean_all() by rt6_remove_prefsrc().
 */
struct arg_dev_net_ip {
        struct net_device *dev;  /* only routes on this device; NULL matches any */
        struct net *net;         /* namespace, used to skip its fib6_null_entry */
        struct in6_addr *addr;   /* address compared against each route's prefsrc */
};
3790
3791 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3792 {
3793         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3794         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3795         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3796
3797         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3798             rt != net->ipv6.fib6_null_entry &&
3799             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3800                 spin_lock_bh(&rt6_exception_lock);
3801                 /* remove prefsrc entry */
3802                 rt->fib6_prefsrc.plen = 0;
3803                 /* need to update cache as well */
3804                 rt6_exceptions_remove_prefsrc(rt);
3805                 spin_unlock_bh(&rt6_exception_lock);
3806         }
3807         return 0;
3808 }
3809
3810 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3811 {
3812         struct net *net = dev_net(ifp->idev->dev);
3813         struct arg_dev_net_ip adni = {
3814                 .dev = ifp->idev->dev,
3815                 .net = net,
3816                 .addr = &ifp->addr,
3817         };
3818         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3819 }
3820
3821 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3822
3823 /* Remove routers and update dst entries when gateway turn into host. */
3824 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3825 {
3826         struct in6_addr *gateway = (struct in6_addr *)arg;
3827
3828         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3829             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3830                 return -1;
3831         }
3832
3833         /* Further clean up cached routes in exception table.
3834          * This is needed because cached route may have a different
3835          * gateway than its 'parent' in the case of an ip redirect.
3836          */
3837         rt6_exceptions_clean_tohost(rt, gateway);
3838
3839         return 0;
3840 }
3841
3842 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3843 {
3844         fib6_clean_all(net, fib6_clean_tohost, gateway);
3845 }
3846
/* Argument passed through fib6_clean_all() to fib6_ifup() and
 * fib6_ifdown().
 *
 * The union is filled according to the callback: fib6_ifup() reads
 * nh_flags, fib6_ifdown() reads event.  Callers initialize the
 * anonymous union positionally, so the member order must not change.
 */
struct arg_netdev_event {
        const struct net_device *dev;   /* device the event is about */
        union {
                unsigned int nh_flags;  /* nexthop flags for fib6_ifup() to clear */
                unsigned long event;    /* NETDEV_* event for fib6_ifdown() */
        };
};
3854
3855 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3856 {
3857         struct fib6_info *iter;
3858         struct fib6_node *fn;
3859
3860         fn = rcu_dereference_protected(rt->fib6_node,
3861                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3862         iter = rcu_dereference_protected(fn->leaf,
3863                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3864         while (iter) {
3865                 if (iter->fib6_metric == rt->fib6_metric &&
3866                     rt6_qualify_for_ecmp(iter))
3867                         return iter;
3868                 iter = rcu_dereference_protected(iter->fib6_next,
3869                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3870         }
3871
3872         return NULL;
3873 }
3874
3875 static bool rt6_is_dead(const struct fib6_info *rt)
3876 {
3877         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3878             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3879              fib6_ignore_linkdown(rt)))
3880                 return true;
3881
3882         return false;
3883 }
3884
3885 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3886 {
3887         struct fib6_info *iter;
3888         int total = 0;
3889
3890         if (!rt6_is_dead(rt))
3891                 total += rt->fib6_nh.nh_weight;
3892
3893         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3894                 if (!rt6_is_dead(iter))
3895                         total += iter->fib6_nh.nh_weight;
3896         }
3897
3898         return total;
3899 }
3900
3901 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3902 {
3903         int upper_bound = -1;
3904
3905         if (!rt6_is_dead(rt)) {
3906                 *weight += rt->fib6_nh.nh_weight;
3907                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3908                                                     total) - 1;
3909         }
3910         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3911 }
3912
3913 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3914 {
3915         struct fib6_info *iter;
3916         int weight = 0;
3917
3918         rt6_upper_bound_set(rt, &weight, total);
3919
3920         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3921                 rt6_upper_bound_set(iter, &weight, total);
3922 }
3923
3924 void rt6_multipath_rebalance(struct fib6_info *rt)
3925 {
3926         struct fib6_info *first;
3927         int total;
3928
3929         /* In case the entire multipath route was marked for flushing,
3930          * then there is no need to rebalance upon the removal of every
3931          * sibling route.
3932          */
3933         if (!rt->fib6_nsiblings || rt->should_flush)
3934                 return;
3935
3936         /* During lookup routes are evaluated in order, so we need to
3937          * make sure upper bounds are assigned from the first sibling
3938          * onwards.
3939          */
3940         first = rt6_multipath_first_sibling(rt);
3941         if (WARN_ON_ONCE(!first))
3942                 return;
3943
3944         total = rt6_multipath_total_weight(first);
3945         rt6_multipath_upper_bound_set(first, total);
3946 }
3947
3948 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3949 {
3950         const struct arg_netdev_event *arg = p_arg;
3951         struct net *net = dev_net(arg->dev);
3952
3953         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3954                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3955                 fib6_update_sernum_upto_root(net, rt);
3956                 rt6_multipath_rebalance(rt);
3957         }
3958
3959         return 0;
3960 }
3961
3962 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3963 {
3964         struct arg_netdev_event arg = {
3965                 .dev = dev,
3966                 {
3967                         .nh_flags = nh_flags,
3968                 },
3969         };
3970
3971         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3972                 arg.nh_flags |= RTNH_F_LINKDOWN;
3973
3974         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3975 }
3976
3977 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3978                                    const struct net_device *dev)
3979 {
3980         struct fib6_info *iter;
3981
3982         if (rt->fib6_nh.nh_dev == dev)
3983                 return true;
3984         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3985                 if (iter->fib6_nh.nh_dev == dev)
3986                         return true;
3987
3988         return false;
3989 }
3990
3991 static void rt6_multipath_flush(struct fib6_info *rt)
3992 {
3993         struct fib6_info *iter;
3994
3995         rt->should_flush = 1;
3996         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3997                 iter->should_flush = 1;
3998 }
3999
4000 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4001                                              const struct net_device *down_dev)
4002 {
4003         struct fib6_info *iter;
4004         unsigned int dead = 0;
4005
4006         if (rt->fib6_nh.nh_dev == down_dev ||
4007             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4008                 dead++;
4009         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4010                 if (iter->fib6_nh.nh_dev == down_dev ||
4011                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4012                         dead++;
4013
4014         return dead;
4015 }
4016
4017 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4018                                        const struct net_device *dev,
4019                                        unsigned int nh_flags)
4020 {
4021         struct fib6_info *iter;
4022
4023         if (rt->fib6_nh.nh_dev == dev)
4024                 rt->fib6_nh.nh_flags |= nh_flags;
4025         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4026                 if (iter->fib6_nh.nh_dev == dev)
4027                         iter->fib6_nh.nh_flags |= nh_flags;
4028 }
4029
4030 /* called with write lock held for table with rt */
4031 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4032 {
4033         const struct arg_netdev_event *arg = p_arg;
4034         const struct net_device *dev = arg->dev;
4035         struct net *net = dev_net(dev);
4036
4037         if (rt == net->ipv6.fib6_null_entry)
4038                 return 0;
4039
4040         switch (arg->event) {
4041         case NETDEV_UNREGISTER:
4042                 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4043         case NETDEV_DOWN:
4044                 if (rt->should_flush)
4045                         return -1;
4046                 if (!rt->fib6_nsiblings)
4047                         return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4048                 if (rt6_multipath_uses_dev(rt, dev)) {
4049                         unsigned int count;
4050
4051                         count = rt6_multipath_dead_count(rt, dev);
4052                         if (rt->fib6_nsiblings + 1 == count) {
4053                                 rt6_multipath_flush(rt);
4054                                 return -1;
4055                         }
4056                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4057                                                    RTNH_F_LINKDOWN);
4058                         fib6_update_sernum(net, rt);
4059                         rt6_multipath_rebalance(rt);
4060                 }
4061                 return -2;
4062         case NETDEV_CHANGE:
4063                 if (rt->fib6_nh.nh_dev != dev ||
4064                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4065                         break;
4066                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4067                 rt6_multipath_rebalance(rt);
4068                 break;
4069         }
4070
4071         return 0;
4072 }
4073
4074 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4075 {
4076         struct arg_netdev_event arg = {
4077                 .dev = dev,
4078                 {
4079                         .event = event,
4080                 },
4081         };
4082
4083         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4084 }
4085
4086 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4087 {
4088         rt6_sync_down_dev(dev, event);
4089         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4090         neigh_ifdown(&nd_tbl, dev);
4091 }
4092
/* Argument handed to rt6_mtu_change_route() through fib6_clean_all()
 * by rt6_mtu_change().
 */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* device whose MTU changed */
        unsigned int mtu;       /* the new MTU */
};
4097
4098 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4099 {
4100         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4101         struct inet6_dev *idev;
4102
4103         /* In IPv6 pmtu discovery is not optional,
4104            so that RTAX_MTU lock cannot disable it.
4105            We still use this lock to block changes
4106            caused by addrconf/ndisc.
4107         */
4108
4109         idev = __in6_dev_get(arg->dev);
4110         if (!idev)
4111                 return 0;
4112
4113         /* For administrative MTU increase, there is no way to discover
4114            IPv6 PMTU increase, so PMTU increase should be updated here.
4115            Since RFC 1981 doesn't include administrative MTU increase
4116            update PMTU increase is a MUST. (i.e. jumbo frame)
4117          */
4118         if (rt->fib6_nh.nh_dev == arg->dev &&
4119             !fib6_metric_locked(rt, RTAX_MTU)) {
4120                 u32 mtu = rt->fib6_pmtu;
4121
4122                 if (mtu >= arg->mtu ||
4123                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4124                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4125
4126                 spin_lock_bh(&rt6_exception_lock);
4127                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4128                 spin_unlock_bh(&rt6_exception_lock);
4129         }
4130         return 0;
4131 }
4132
4133 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4134 {
4135         struct rt6_mtu_change_arg arg = {
4136                 .dev = dev,
4137                 .mtu = mtu,
4138         };
4139
4140         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4141 }
4142
/* Netlink attribute policy for IPv6 route messages, indexed by RTA_*
 * attribute type.  It is the policy handed to nlmsg_parse() in
 * rtm_to_fib6_config().  Attributes without an entry here (RTA_DST and
 * RTA_SRC among them) have no policy constraint from this table.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        /* address-valued attributes: length of one IPv6 address */
        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
        [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
        [RTA_OIF]               = { .type = NLA_U32 },
        [RTA_IIF]               = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
        /* length of one struct rtnexthop */
        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
        [RTA_PREF]              = { .type = NLA_U8 },
        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [RTA_ENCAP]             = { .type = NLA_NESTED },
        [RTA_EXPIRES]           = { .type = NLA_U32 },
        [RTA_UID]               = { .type = NLA_U32 },
        [RTA_MARK]              = { .type = NLA_U32 },
        [RTA_TABLE]             = { .type = NLA_U32 },
        [RTA_IP_PROTO]          = { .type = NLA_U8 },
        [RTA_SPORT]             = { .type = NLA_U16 },
        [RTA_DPORT]             = { .type = NLA_U16 },
};
4162
4163 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4164                               struct fib6_config *cfg,
4165                               struct netlink_ext_ack *extack)
4166 {
4167         struct rtmsg *rtm;
4168         struct nlattr *tb[RTA_MAX+1];
4169         unsigned int pref;
4170         int err;
4171
4172         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4173                           NULL);
4174         if (err < 0)
4175                 goto errout;
4176
4177         err = -EINVAL;
4178         rtm = nlmsg_data(nlh);
4179         memset(cfg, 0, sizeof(*cfg));
4180
4181         cfg->fc_table = rtm->rtm_table;
4182         cfg->fc_dst_len = rtm->rtm_dst_len;
4183         cfg->fc_src_len = rtm->rtm_src_len;
4184         cfg->fc_flags = RTF_UP;
4185         cfg->fc_protocol = rtm->rtm_protocol;
4186         cfg->fc_type = rtm->rtm_type;
4187
4188         if (rtm->rtm_type == RTN_UNREACHABLE ||
4189             rtm->rtm_type == RTN_BLACKHOLE ||
4190             rtm->rtm_type == RTN_PROHIBIT ||
4191             rtm->rtm_type == RTN_THROW)
4192                 cfg->fc_flags |= RTF_REJECT;
4193
4194         if (rtm->rtm_type == RTN_LOCAL)
4195                 cfg->fc_flags |= RTF_LOCAL;
4196
4197         if (rtm->rtm_flags & RTM_F_CLONED)
4198                 cfg->fc_flags |= RTF_CACHE;
4199
4200         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4201
4202         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4203         cfg->fc_nlinfo.nlh = nlh;
4204         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4205
4206         if (tb[RTA_GATEWAY]) {
4207                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4208                 cfg->fc_flags |= RTF_GATEWAY;
4209         }
4210
4211         if (tb[RTA_DST]) {
4212                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4213
4214                 if (nla_len(tb[RTA_DST]) < plen)
4215                         goto errout;
4216
4217                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4218         }
4219
4220         if (tb[RTA_SRC]) {
4221                 int plen = (rtm->rtm_src_len + 7) >> 3;
4222
4223                 if (nla_len(tb[RTA_SRC]) < plen)
4224                         goto errout;
4225
4226                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4227         }
4228
4229         if (tb[RTA_PREFSRC])
4230                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4231
4232         if (tb[RTA_OIF])
4233                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4234
4235         if (tb[RTA_PRIORITY])
4236                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4237
4238         if (tb[RTA_METRICS]) {
4239                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4240                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4241         }
4242
4243         if (tb[RTA_TABLE])
4244                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4245
4246         if (tb[RTA_MULTIPATH]) {
4247                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4248                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4249
4250                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4251                                                      cfg->fc_mp_len, extack);
4252                 if (err < 0)
4253                         goto errout;
4254         }
4255
4256         if (tb[RTA_PREF]) {
4257                 pref = nla_get_u8(tb[RTA_PREF]);
4258                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4259                     pref != ICMPV6_ROUTER_PREF_HIGH)
4260                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4261                 cfg->fc_flags |= RTF_PREF(pref);
4262         }
4263
4264         if (tb[RTA_ENCAP])
4265                 cfg->fc_encap = tb[RTA_ENCAP];
4266
4267         if (tb[RTA_ENCAP_TYPE]) {
4268                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4269
4270                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4271                 if (err < 0)
4272                         goto errout;
4273         }
4274
4275         if (tb[RTA_EXPIRES]) {
4276                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4277
4278                 if (addrconf_finite_timeout(timeout)) {
4279                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4280                         cfg->fc_flags |= RTF_EXPIRES;
4281                 }
4282         }
4283
4284         err = 0;
4285 errout:
4286         return err;
4287 }
4288
4289 struct rt6_nh {
4290         struct fib6_info *fib6_info;
4291         struct fib6_config r_cfg;
4292         struct list_head next;
4293 };
4294
4295 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4296 {
4297         struct rt6_nh *nh;
4298
4299         list_for_each_entry(nh, rt6_nh_list, next) {
4300                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4301                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4302                         nh->r_cfg.fc_ifindex);
4303         }
4304 }
4305
4306 static int ip6_route_info_append(struct net *net,
4307                                  struct list_head *rt6_nh_list,
4308                                  struct fib6_info *rt,
4309                                  struct fib6_config *r_cfg)
4310 {
4311         struct rt6_nh *nh;
4312         int err = -EEXIST;
4313
4314         list_for_each_entry(nh, rt6_nh_list, next) {
4315                 /* check if fib6_info already exists */
4316                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4317                         return err;
4318         }
4319
4320         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4321         if (!nh)
4322                 return -ENOMEM;
4323         nh->fib6_info = rt;
4324         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4325         list_add_tail(&nh->next, rt6_nh_list);
4326
4327         return 0;
4328 }
4329
4330 static void ip6_route_mpath_notify(struct fib6_info *rt,
4331                                    struct fib6_info *rt_last,
4332                                    struct nl_info *info,
4333                                    __u16 nlflags)
4334 {
4335         /* if this is an APPEND route, then rt points to the first route
4336          * inserted and rt_last points to last route inserted. Userspace
4337          * wants a consistent dump of the route which starts at the first
4338          * nexthop. Since sibling routes are always added at the end of
4339          * the list, find the first sibling of the last route appended
4340          */
4341         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4342                 rt = list_first_entry(&rt_last->fib6_siblings,
4343                                       struct fib6_info,
4344                                       fib6_siblings);
4345         }
4346
4347         if (rt)
4348                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4349 }
4350
4351 static int ip6_route_multipath_add(struct fib6_config *cfg,
4352                                    struct netlink_ext_ack *extack)
4353 {
4354         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4355         struct nl_info *info = &cfg->fc_nlinfo;
4356         struct fib6_config r_cfg;
4357         struct rtnexthop *rtnh;
4358         struct fib6_info *rt;
4359         struct rt6_nh *err_nh;
4360         struct rt6_nh *nh, *nh_safe;
4361         __u16 nlflags;
4362         int remaining;
4363         int attrlen;
4364         int err = 1;
4365         int nhn = 0;
4366         int replace = (cfg->fc_nlinfo.nlh &&
4367                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4368         LIST_HEAD(rt6_nh_list);
4369
4370         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4371         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4372                 nlflags |= NLM_F_APPEND;
4373
4374         remaining = cfg->fc_mp_len;
4375         rtnh = (struct rtnexthop *)cfg->fc_mp;
4376
4377         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4378          * fib6_info structs per nexthop
4379          */
4380         while (rtnh_ok(rtnh, remaining)) {
4381                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4382                 if (rtnh->rtnh_ifindex)
4383                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4384
4385                 attrlen = rtnh_attrlen(rtnh);
4386                 if (attrlen > 0) {
4387                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4388
4389                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4390                         if (nla) {
4391                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4392                                 r_cfg.fc_flags |= RTF_GATEWAY;
4393                         }
4394                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4395                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4396                         if (nla)
4397                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4398                 }
4399
4400                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4401                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4402                 if (IS_ERR(rt)) {
4403                         err = PTR_ERR(rt);
4404                         rt = NULL;
4405                         goto cleanup;
4406                 }
4407                 if (!rt6_qualify_for_ecmp(rt)) {
4408                         err = -EINVAL;
4409                         NL_SET_ERR_MSG(extack,
4410                                        "Device only routes can not be added for IPv6 using the multipath API.");
4411                         fib6_info_release(rt);
4412                         goto cleanup;
4413                 }
4414
4415                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4416
4417                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4418                                             rt, &r_cfg);
4419                 if (err) {
4420                         fib6_info_release(rt);
4421                         goto cleanup;
4422                 }
4423
4424                 rtnh = rtnh_next(rtnh, &remaining);
4425         }
4426
4427         /* for add and replace send one notification with all nexthops.
4428          * Skip the notification in fib6_add_rt2node and send one with
4429          * the full route when done
4430          */
4431         info->skip_notify = 1;
4432
4433         err_nh = NULL;
4434         list_for_each_entry(nh, &rt6_nh_list, next) {
4435                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4436                 fib6_info_release(nh->fib6_info);
4437
4438                 if (!err) {
4439                         /* save reference to last route successfully inserted */
4440                         rt_last = nh->fib6_info;
4441
4442                         /* save reference to first route for notification */
4443                         if (!rt_notif)
4444                                 rt_notif = nh->fib6_info;
4445                 }
4446
4447                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4448                 nh->fib6_info = NULL;
4449                 if (err) {
4450                         if (replace && nhn)
4451                                 ip6_print_replace_route_err(&rt6_nh_list);
4452                         err_nh = nh;
4453                         goto add_errout;
4454                 }
4455
4456                 /* Because each route is added like a single route we remove
4457                  * these flags after the first nexthop: if there is a collision,
4458                  * we have already failed to add the first nexthop:
4459                  * fib6_add_rt2node() has rejected it; when replacing, old
4460                  * nexthops have been replaced by first new, the rest should
4461                  * be added to it.
4462                  */
4463                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4464                                                      NLM_F_REPLACE);
4465                 nhn++;
4466         }
4467
4468         /* success ... tell user about new route */
4469         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4470         goto cleanup;
4471
4472 add_errout:
4473         /* send notification for routes that were added so that
4474          * the delete notifications sent by ip6_route_del are
4475          * coherent
4476          */
4477         if (rt_notif)
4478                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4479
4480         /* Delete routes that were already added */
4481         list_for_each_entry(nh, &rt6_nh_list, next) {
4482                 if (err_nh == nh)
4483                         break;
4484                 ip6_route_del(&nh->r_cfg, extack);
4485         }
4486
4487 cleanup:
4488         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4489                 if (nh->fib6_info)
4490                         fib6_info_release(nh->fib6_info);
4491                 list_del(&nh->next);
4492                 kfree(nh);
4493         }
4494
4495         return err;
4496 }
4497
4498 static int ip6_route_multipath_del(struct fib6_config *cfg,
4499                                    struct netlink_ext_ack *extack)
4500 {
4501         struct fib6_config r_cfg;
4502         struct rtnexthop *rtnh;
4503         int remaining;
4504         int attrlen;
4505         int err = 1, last_err = 0;
4506
4507         remaining = cfg->fc_mp_len;
4508         rtnh = (struct rtnexthop *)cfg->fc_mp;
4509
4510         /* Parse a Multipath Entry */
4511         while (rtnh_ok(rtnh, remaining)) {
4512                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4513                 if (rtnh->rtnh_ifindex)
4514                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4515
4516                 attrlen = rtnh_attrlen(rtnh);
4517                 if (attrlen > 0) {
4518                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4519
4520                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4521                         if (nla) {
4522                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4523                                 r_cfg.fc_flags |= RTF_GATEWAY;
4524                         }
4525                 }
4526                 err = ip6_route_del(&r_cfg, extack);
4527                 if (err)
4528                         last_err = err;
4529
4530                 rtnh = rtnh_next(rtnh, &remaining);
4531         }
4532
4533         return last_err;
4534 }
4535
4536 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4537                               struct netlink_ext_ack *extack)
4538 {
4539         struct fib6_config cfg;
4540         int err;
4541
4542         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4543         if (err < 0)
4544                 return err;
4545
4546         if (cfg.fc_mp)
4547                 return ip6_route_multipath_del(&cfg, extack);
4548         else {
4549                 cfg.fc_delete_all_nh = 1;
4550                 return ip6_route_del(&cfg, extack);
4551         }
4552 }
4553
4554 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4555                               struct netlink_ext_ack *extack)
4556 {
4557         struct fib6_config cfg;
4558         int err;
4559
4560         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4561         if (err < 0)
4562                 return err;
4563
4564         if (cfg.fc_mp)
4565                 return ip6_route_multipath_add(&cfg, extack);
4566         else
4567                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4568 }
4569
4570 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4571 {
4572         int nexthop_len = 0;
4573
4574         if (rt->fib6_nsiblings) {
4575                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4576                             + NLA_ALIGN(sizeof(struct rtnexthop))
4577                             + nla_total_size(16) /* RTA_GATEWAY */
4578                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4579
4580                 nexthop_len *= rt->fib6_nsiblings;
4581         }
4582
4583         return NLMSG_ALIGN(sizeof(struct rtmsg))
4584                + nla_total_size(16) /* RTA_SRC */
4585                + nla_total_size(16) /* RTA_DST */
4586                + nla_total_size(16) /* RTA_GATEWAY */
4587                + nla_total_size(16) /* RTA_PREFSRC */
4588                + nla_total_size(4) /* RTA_TABLE */
4589                + nla_total_size(4) /* RTA_IIF */
4590                + nla_total_size(4) /* RTA_OIF */
4591                + nla_total_size(4) /* RTA_PRIORITY */
4592                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4593                + nla_total_size(sizeof(struct rta_cacheinfo))
4594                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4595                + nla_total_size(1) /* RTA_PREF */
4596                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4597                + nexthop_len;
4598 }
4599
4600 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4601                             unsigned int *flags, bool skip_oif)
4602 {
4603         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4604                 *flags |= RTNH_F_DEAD;
4605
4606         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4607                 *flags |= RTNH_F_LINKDOWN;
4608
4609                 rcu_read_lock();
4610                 if (fib6_ignore_linkdown(rt))
4611                         *flags |= RTNH_F_DEAD;
4612                 rcu_read_unlock();
4613         }
4614
4615         if (rt->fib6_flags & RTF_GATEWAY) {
4616                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4617                         goto nla_put_failure;
4618         }
4619
4620         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4621         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4622                 *flags |= RTNH_F_OFFLOAD;
4623
4624         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4625         if (!skip_oif && rt->fib6_nh.nh_dev &&
4626             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4627                 goto nla_put_failure;
4628
4629         if (rt->fib6_nh.nh_lwtstate &&
4630             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4631                 goto nla_put_failure;
4632
4633         return 0;
4634
4635 nla_put_failure:
4636         return -EMSGSIZE;
4637 }
4638
4639 /* add multipath next hop */
4640 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4641 {
4642         const struct net_device *dev = rt->fib6_nh.nh_dev;
4643         struct rtnexthop *rtnh;
4644         unsigned int flags = 0;
4645
4646         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4647         if (!rtnh)
4648                 goto nla_put_failure;
4649
4650         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4651         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4652
4653         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4654                 goto nla_put_failure;
4655
4656         rtnh->rtnh_flags = flags;
4657
4658         /* length of rtnetlink header + attributes */
4659         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4660
4661         return 0;
4662
4663 nla_put_failure:
4664         return -EMSGSIZE;
4665 }
4666
4667 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4668                          struct fib6_info *rt, struct dst_entry *dst,
4669                          struct in6_addr *dest, struct in6_addr *src,
4670                          int iif, int type, u32 portid, u32 seq,
4671                          unsigned int flags)
4672 {
4673         struct rt6_info *rt6 = (struct rt6_info *)dst;
4674         struct rt6key *rt6_dst, *rt6_src;
4675         u32 *pmetrics, table, rt6_flags;
4676         struct nlmsghdr *nlh;
4677         struct rtmsg *rtm;
4678         long expires = 0;
4679
4680         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4681         if (!nlh)
4682                 return -EMSGSIZE;
4683
4684         if (rt6) {
4685                 rt6_dst = &rt6->rt6i_dst;
4686                 rt6_src = &rt6->rt6i_src;
4687                 rt6_flags = rt6->rt6i_flags;
4688         } else {
4689                 rt6_dst = &rt->fib6_dst;
4690                 rt6_src = &rt->fib6_src;
4691                 rt6_flags = rt->fib6_flags;
4692         }
4693
4694         rtm = nlmsg_data(nlh);
4695         rtm->rtm_family = AF_INET6;
4696         rtm->rtm_dst_len = rt6_dst->plen;
4697         rtm->rtm_src_len = rt6_src->plen;
4698         rtm->rtm_tos = 0;
4699         if (rt->fib6_table)
4700                 table = rt->fib6_table->tb6_id;
4701         else
4702                 table = RT6_TABLE_UNSPEC;
4703         rtm->rtm_table = table;
4704         if (nla_put_u32(skb, RTA_TABLE, table))
4705                 goto nla_put_failure;
4706
4707         rtm->rtm_type = rt->fib6_type;
4708         rtm->rtm_flags = 0;
4709         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4710         rtm->rtm_protocol = rt->fib6_protocol;
4711
4712         if (rt6_flags & RTF_CACHE)
4713                 rtm->rtm_flags |= RTM_F_CLONED;
4714
4715         if (dest) {
4716                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4717                         goto nla_put_failure;
4718                 rtm->rtm_dst_len = 128;
4719         } else if (rtm->rtm_dst_len)
4720                 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4721                         goto nla_put_failure;
4722 #ifdef CONFIG_IPV6_SUBTREES
4723         if (src) {
4724                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4725                         goto nla_put_failure;
4726                 rtm->rtm_src_len = 128;
4727         } else if (rtm->rtm_src_len &&
4728                    nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4729                 goto nla_put_failure;
4730 #endif
4731         if (iif) {
4732 #ifdef CONFIG_IPV6_MROUTE
4733                 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4734                         int err = ip6mr_get_route(net, skb, rtm, portid);
4735
4736                         if (err == 0)
4737                                 return 0;
4738                         if (err < 0)
4739                                 goto nla_put_failure;
4740                 } else
4741 #endif
4742                         if (nla_put_u32(skb, RTA_IIF, iif))
4743                                 goto nla_put_failure;
4744         } else if (dest) {
4745                 struct in6_addr saddr_buf;
4746                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4747                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4748                         goto nla_put_failure;
4749         }
4750
4751         if (rt->fib6_prefsrc.plen) {
4752                 struct in6_addr saddr_buf;
4753                 saddr_buf = rt->fib6_prefsrc.addr;
4754                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4755                         goto nla_put_failure;
4756         }
4757
4758         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4759         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4760                 goto nla_put_failure;
4761
4762         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4763                 goto nla_put_failure;
4764
4765         /* For multipath routes, walk the siblings list and add
4766          * each as a nexthop within RTA_MULTIPATH.
4767          */
4768         if (rt6) {
4769                 if (rt6_flags & RTF_GATEWAY &&
4770                     nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4771                         goto nla_put_failure;
4772
4773                 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4774                         goto nla_put_failure;
4775         } else if (rt->fib6_nsiblings) {
4776                 struct fib6_info *sibling, *next_sibling;
4777                 struct nlattr *mp;
4778
4779                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4780                 if (!mp)
4781                         goto nla_put_failure;
4782
4783                 if (rt6_add_nexthop(skb, rt) < 0)
4784                         goto nla_put_failure;
4785
4786                 list_for_each_entry_safe(sibling, next_sibling,
4787                                          &rt->fib6_siblings, fib6_siblings) {
4788                         if (rt6_add_nexthop(skb, sibling) < 0)
4789                                 goto nla_put_failure;
4790                 }
4791
4792                 nla_nest_end(skb, mp);
4793         } else {
4794                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4795                         goto nla_put_failure;
4796         }
4797
4798         if (rt6_flags & RTF_EXPIRES) {
4799                 expires = dst ? dst->expires : rt->expires;
4800                 expires -= jiffies;
4801         }
4802
4803         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4804                 goto nla_put_failure;
4805
4806         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4807                 goto nla_put_failure;
4808
4809
4810         nlmsg_end(skb, nlh);
4811         return 0;
4812
4813 nla_put_failure:
4814         nlmsg_cancel(skb, nlh);
4815         return -EMSGSIZE;
4816 }
4817
4818 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4819 {
4820         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4821         struct net *net = arg->net;
4822
4823         if (rt == net->ipv6.fib6_null_entry)
4824                 return 0;
4825
4826         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4827                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4828
4829                 /* user wants prefix routes only */
4830                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4831                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4832                         /* success since this is not a prefix route */
4833                         return 1;
4834                 }
4835         }
4836
4837         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4838                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4839                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4840 }
4841
4842 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4843                               struct netlink_ext_ack *extack)
4844 {
4845         struct net *net = sock_net(in_skb->sk);
4846         struct nlattr *tb[RTA_MAX+1];
4847         int err, iif = 0, oif = 0;
4848         struct fib6_info *from;
4849         struct dst_entry *dst;
4850         struct rt6_info *rt;
4851         struct sk_buff *skb;
4852         struct rtmsg *rtm;
4853         struct flowi6 fl6;
4854         bool fibmatch;
4855
4856         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4857                           extack);
4858         if (err < 0)
4859                 goto errout;
4860
4861         err = -EINVAL;
4862         memset(&fl6, 0, sizeof(fl6));
4863         rtm = nlmsg_data(nlh);
4864         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4865         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4866
4867         if (tb[RTA_SRC]) {
4868                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4869                         goto errout;
4870
4871                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4872         }
4873
4874         if (tb[RTA_DST]) {
4875                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4876                         goto errout;
4877
4878                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4879         }
4880
4881         if (tb[RTA_IIF])
4882                 iif = nla_get_u32(tb[RTA_IIF]);
4883
4884         if (tb[RTA_OIF])
4885                 oif = nla_get_u32(tb[RTA_OIF]);
4886
4887         if (tb[RTA_MARK])
4888                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4889
4890         if (tb[RTA_UID])
4891                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4892                                            nla_get_u32(tb[RTA_UID]));
4893         else
4894                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4895
4896         if (tb[RTA_SPORT])
4897                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4898
4899         if (tb[RTA_DPORT])
4900                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4901
4902         if (tb[RTA_IP_PROTO]) {
4903                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4904                                                   &fl6.flowi6_proto, extack);
4905                 if (err)
4906                         goto errout;
4907         }
4908
4909         if (iif) {
4910                 struct net_device *dev;
4911                 int flags = 0;
4912
4913                 rcu_read_lock();
4914
4915                 dev = dev_get_by_index_rcu(net, iif);
4916                 if (!dev) {
4917                         rcu_read_unlock();
4918                         err = -ENODEV;
4919                         goto errout;
4920                 }
4921
4922                 fl6.flowi6_iif = iif;
4923
4924                 if (!ipv6_addr_any(&fl6.saddr))
4925                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4926
4927                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4928
4929                 rcu_read_unlock();
4930         } else {
4931                 fl6.flowi6_oif = oif;
4932
4933                 dst = ip6_route_output(net, NULL, &fl6);
4934         }
4935
4936
4937         rt = container_of(dst, struct rt6_info, dst);
4938         if (rt->dst.error) {
4939                 err = rt->dst.error;
4940                 ip6_rt_put(rt);
4941                 goto errout;
4942         }
4943
4944         if (rt == net->ipv6.ip6_null_entry) {
4945                 err = rt->dst.error;
4946                 ip6_rt_put(rt);
4947                 goto errout;
4948         }
4949
4950         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4951         if (!skb) {
4952                 ip6_rt_put(rt);
4953                 err = -ENOBUFS;
4954                 goto errout;
4955         }
4956
4957         skb_dst_set(skb, &rt->dst);
4958
4959         rcu_read_lock();
4960         from = rcu_dereference(rt->from);
4961
4962         if (fibmatch)
4963                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4964                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4965                                     nlh->nlmsg_seq, 0);
4966         else
4967                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4968                                     &fl6.saddr, iif, RTM_NEWROUTE,
4969                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4970                                     0);
4971         rcu_read_unlock();
4972
4973         if (err < 0) {
4974                 kfree_skb(skb);
4975                 goto errout;
4976         }
4977
4978         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4979 errout:
4980         return err;
4981 }
4982
4983 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4984                      unsigned int nlm_flags)
4985 {
4986         struct sk_buff *skb;
4987         struct net *net = info->nl_net;
4988         u32 seq;
4989         int err;
4990
4991         err = -ENOBUFS;
4992         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4993
4994         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4995         if (!skb)
4996                 goto errout;
4997
4998         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4999                             event, info->portid, seq, nlm_flags);
5000         if (err < 0) {
5001                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5002                 WARN_ON(err == -EMSGSIZE);
5003                 kfree_skb(skb);
5004                 goto errout;
5005         }
5006         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5007                     info->nlh, gfp_any());
5008         return;
5009 errout:
5010         if (err < 0)
5011                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5012 }
5013
5014 static int ip6_route_dev_notify(struct notifier_block *this,
5015                                 unsigned long event, void *ptr)
5016 {
5017         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5018         struct net *net = dev_net(dev);
5019
5020         if (!(dev->flags & IFF_LOOPBACK))
5021                 return NOTIFY_OK;
5022
5023         if (event == NETDEV_REGISTER) {
5024                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5025                 net->ipv6.ip6_null_entry->dst.dev = dev;
5026                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5027 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5028                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5029                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5030                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5031                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5032 #endif
5033          } else if (event == NETDEV_UNREGISTER &&
5034                     dev->reg_state != NETREG_UNREGISTERED) {
5035                 /* NETDEV_UNREGISTER could be fired for multiple times by
5036                  * netdev_wait_allrefs(). Make sure we only call this once.
5037                  */
5038                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5039 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5040                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5041                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5042 #endif
5043         }
5044
5045         return NOTIFY_OK;
5046 }
5047
5048 /*
5049  *      /proc
5050  */
5051
5052 #ifdef CONFIG_PROC_FS
5053 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5054 {
5055         struct net *net = (struct net *)seq->private;
5056         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5057                    net->ipv6.rt6_stats->fib_nodes,
5058                    net->ipv6.rt6_stats->fib_route_nodes,
5059                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5060                    net->ipv6.rt6_stats->fib_rt_entries,
5061                    net->ipv6.rt6_stats->fib_rt_cache,
5062                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5063                    net->ipv6.rt6_stats->fib_discarded_routes);
5064
5065         return 0;
5066 }
5067 #endif  /* CONFIG_PROC_FS */
5068
5069 #ifdef CONFIG_SYSCTL
5070
5071 static
5072 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5073                               void __user *buffer, size_t *lenp, loff_t *ppos)
5074 {
5075         struct net *net;
5076         int delay;
5077         if (!write)
5078                 return -EINVAL;
5079
5080         net = (struct net *)ctl->extra1;
5081         delay = net->ipv6.sysctl.flush_delay;
5082         proc_dointvec(ctl, write, buffer, lenp, ppos);
5083         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5084         return 0;
5085 }
5086
5087 struct ctl_table ipv6_route_table_template[] = {
5088         {
5089                 .procname       =       "flush",
5090                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5091                 .maxlen         =       sizeof(int),
5092                 .mode           =       0200,
5093                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5094         },
5095         {
5096                 .procname       =       "gc_thresh",
5097                 .data           =       &ip6_dst_ops_template.gc_thresh,
5098                 .maxlen         =       sizeof(int),
5099                 .mode           =       0644,
5100                 .proc_handler   =       proc_dointvec,
5101         },
5102         {
5103                 .procname       =       "max_size",
5104                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5105                 .maxlen         =       sizeof(int),
5106                 .mode           =       0644,
5107                 .proc_handler   =       proc_dointvec,
5108         },
5109         {
5110                 .procname       =       "gc_min_interval",
5111                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5112                 .maxlen         =       sizeof(int),
5113                 .mode           =       0644,
5114                 .proc_handler   =       proc_dointvec_jiffies,
5115         },
5116         {
5117                 .procname       =       "gc_timeout",
5118                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5119                 .maxlen         =       sizeof(int),
5120                 .mode           =       0644,
5121                 .proc_handler   =       proc_dointvec_jiffies,
5122         },
5123         {
5124                 .procname       =       "gc_interval",
5125                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5126                 .maxlen         =       sizeof(int),
5127                 .mode           =       0644,
5128                 .proc_handler   =       proc_dointvec_jiffies,
5129         },
5130         {
5131                 .procname       =       "gc_elasticity",
5132                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5133                 .maxlen         =       sizeof(int),
5134                 .mode           =       0644,
5135                 .proc_handler   =       proc_dointvec,
5136         },
5137         {
5138                 .procname       =       "mtu_expires",
5139                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5140                 .maxlen         =       sizeof(int),
5141                 .mode           =       0644,
5142                 .proc_handler   =       proc_dointvec_jiffies,
5143         },
5144         {
5145                 .procname       =       "min_adv_mss",
5146                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5147                 .maxlen         =       sizeof(int),
5148                 .mode           =       0644,
5149                 .proc_handler   =       proc_dointvec,
5150         },
5151         {
5152                 .procname       =       "gc_min_interval_ms",
5153                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5154                 .maxlen         =       sizeof(int),
5155                 .mode           =       0644,
5156                 .proc_handler   =       proc_dointvec_ms_jiffies,
5157         },
5158         { }
5159 };
5160
5161 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5162 {
5163         struct ctl_table *table;
5164
5165         table = kmemdup(ipv6_route_table_template,
5166                         sizeof(ipv6_route_table_template),
5167                         GFP_KERNEL);
5168
5169         if (table) {
5170                 table[0].data = &net->ipv6.sysctl.flush_delay;
5171                 table[0].extra1 = net;
5172                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5173                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5174                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5175                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5176                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5177                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5178                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5179                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5180                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5181
5182                 /* Don't export sysctls to unprivileged users */
5183                 if (net->user_ns != &init_user_ns)
5184                         table[0].procname = NULL;
5185         }
5186
5187         return table;
5188 }
5189 #endif
5190
5191 static int __net_init ip6_route_net_init(struct net *net)
5192 {
5193         int ret = -ENOMEM;
5194
5195         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5196                sizeof(net->ipv6.ip6_dst_ops));
5197
5198         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5199                 goto out_ip6_dst_ops;
5200
5201         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5202                                             sizeof(*net->ipv6.fib6_null_entry),
5203                                             GFP_KERNEL);
5204         if (!net->ipv6.fib6_null_entry)
5205                 goto out_ip6_dst_entries;
5206
5207         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5208                                            sizeof(*net->ipv6.ip6_null_entry),
5209                                            GFP_KERNEL);
5210         if (!net->ipv6.ip6_null_entry)
5211                 goto out_fib6_null_entry;
5212         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5213         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5214                          ip6_template_metrics, true);
5215
5216 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5217         net->ipv6.fib6_has_custom_rules = false;
5218         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5219                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5220                                                GFP_KERNEL);
5221         if (!net->ipv6.ip6_prohibit_entry)
5222                 goto out_ip6_null_entry;
5223         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5224         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5225                          ip6_template_metrics, true);
5226
5227         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5228                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5229                                                GFP_KERNEL);
5230         if (!net->ipv6.ip6_blk_hole_entry)
5231                 goto out_ip6_prohibit_entry;
5232         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5233         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5234                          ip6_template_metrics, true);
5235 #endif
5236
5237         net->ipv6.sysctl.flush_delay = 0;
5238         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5239         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5240         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5241         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5242         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5243         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5244         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5245
5246         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5247
5248         ret = 0;
5249 out:
5250         return ret;
5251
5252 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5253 out_ip6_prohibit_entry:
5254         kfree(net->ipv6.ip6_prohibit_entry);
5255 out_ip6_null_entry:
5256         kfree(net->ipv6.ip6_null_entry);
5257 #endif
5258 out_fib6_null_entry:
5259         kfree(net->ipv6.fib6_null_entry);
5260 out_ip6_dst_entries:
5261         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5262 out_ip6_dst_ops:
5263         goto out;
5264 }
5265
5266 static void __net_exit ip6_route_net_exit(struct net *net)
5267 {
5268         kfree(net->ipv6.fib6_null_entry);
5269         kfree(net->ipv6.ip6_null_entry);
5270 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5271         kfree(net->ipv6.ip6_prohibit_entry);
5272         kfree(net->ipv6.ip6_blk_hole_entry);
5273 #endif
5274         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5275 }
5276
5277 static int __net_init ip6_route_net_init_late(struct net *net)
5278 {
5279 #ifdef CONFIG_PROC_FS
5280         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5281                         sizeof(struct ipv6_route_iter));
5282         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5283                         rt6_stats_seq_show, NULL);
5284 #endif
5285         return 0;
5286 }
5287
5288 static void __net_exit ip6_route_net_exit_late(struct net *net)
5289 {
5290 #ifdef CONFIG_PROC_FS
5291         remove_proc_entry("ipv6_route", net->proc_net);
5292         remove_proc_entry("rt6_stats", net->proc_net);
5293 #endif
5294 }
5295
5296 static struct pernet_operations ip6_route_net_ops = {
5297         .init = ip6_route_net_init,
5298         .exit = ip6_route_net_exit,
5299 };
5300
5301 static int __net_init ipv6_inetpeer_init(struct net *net)
5302 {
5303         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5304
5305         if (!bp)
5306                 return -ENOMEM;
5307         inet_peer_base_init(bp);
5308         net->ipv6.peers = bp;
5309         return 0;
5310 }
5311
5312 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5313 {
5314         struct inet_peer_base *bp = net->ipv6.peers;
5315
5316         net->ipv6.peers = NULL;
5317         inetpeer_invalidate_tree(bp);
5318         kfree(bp);
5319 }
5320
5321 static struct pernet_operations ipv6_inetpeer_ops = {
5322         .init   =       ipv6_inetpeer_init,
5323         .exit   =       ipv6_inetpeer_exit,
5324 };
5325
5326 static struct pernet_operations ip6_route_net_late_ops = {
5327         .init = ip6_route_net_init_late,
5328         .exit = ip6_route_net_exit_late,
5329 };
5330
5331 static struct notifier_block ip6_route_dev_notifier = {
5332         .notifier_call = ip6_route_dev_notify,
5333         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5334 };
5335
5336 void __init ip6_route_init_special_entries(void)
5337 {
5338         /* Registering of the loopback is done before this portion of code,
5339          * the loopback reference in rt6_info will not be taken, do it
5340          * manually for init_net */
5341         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5342         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5343         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5344   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5345         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5346         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5347         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5348         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5349   #endif
5350 }
5351
5352 int __init ip6_route_init(void)
5353 {
5354         int ret;
5355         int cpu;
5356
5357         ret = -ENOMEM;
5358         ip6_dst_ops_template.kmem_cachep =
5359                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5360                                   SLAB_HWCACHE_ALIGN, NULL);
5361         if (!ip6_dst_ops_template.kmem_cachep)
5362                 goto out;
5363
5364         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5365         if (ret)
5366                 goto out_kmem_cache;
5367
5368         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5369         if (ret)
5370                 goto out_dst_entries;
5371
5372         ret = register_pernet_subsys(&ip6_route_net_ops);
5373         if (ret)
5374                 goto out_register_inetpeer;
5375
5376         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5377
5378         ret = fib6_init();
5379         if (ret)
5380                 goto out_register_subsys;
5381
5382         ret = xfrm6_init();
5383         if (ret)
5384                 goto out_fib6_init;
5385
5386         ret = fib6_rules_init();
5387         if (ret)
5388                 goto xfrm6_init;
5389
5390         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5391         if (ret)
5392                 goto fib6_rules_init;
5393
5394         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5395                                    inet6_rtm_newroute, NULL, 0);
5396         if (ret < 0)
5397                 goto out_register_late_subsys;
5398
5399         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5400                                    inet6_rtm_delroute, NULL, 0);
5401         if (ret < 0)
5402                 goto out_register_late_subsys;
5403
5404         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5405                                    inet6_rtm_getroute, NULL,
5406                                    RTNL_FLAG_DOIT_UNLOCKED);
5407         if (ret < 0)
5408                 goto out_register_late_subsys;
5409
5410         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5411         if (ret)
5412                 goto out_register_late_subsys;
5413
5414         for_each_possible_cpu(cpu) {
5415                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5416
5417                 INIT_LIST_HEAD(&ul->head);
5418                 spin_lock_init(&ul->lock);
5419         }
5420
5421 out:
5422         return ret;
5423
5424 out_register_late_subsys:
5425         rtnl_unregister_all(PF_INET6);
5426         unregister_pernet_subsys(&ip6_route_net_late_ops);
5427 fib6_rules_init:
5428         fib6_rules_cleanup();
5429 xfrm6_init:
5430         xfrm6_fini();
5431 out_fib6_init:
5432         fib6_gc_cleanup();
5433 out_register_subsys:
5434         unregister_pernet_subsys(&ip6_route_net_ops);
5435 out_register_inetpeer:
5436         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5437 out_dst_entries:
5438         dst_entries_destroy(&ip6_dst_blackhole_ops);
5439 out_kmem_cache:
5440         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5441         goto out;
5442 }
5443
5444 void ip6_route_cleanup(void)
5445 {
5446         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5447         unregister_pernet_subsys(&ip6_route_net_late_ops);
5448         fib6_rules_cleanup();
5449         xfrm6_fini();
5450         fib6_gc_cleanup();
5451         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5452         unregister_pernet_subsys(&ip6_route_net_ops);
5453         dst_entries_destroy(&ip6_dst_blackhole_ops);
5454         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5455 }