/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD,
 *                                      though our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
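
/* Usage sketch (assuming the ipv4_route_table sysctl table defined later
 * in this file): the knobs above are runtime-tunable under
 * /proc/sys/net/ipv4/route/, e.g.
 *
 *   # cat /proc/sys/net/ipv4/route/min_pmtu          -> 552
 *   # echo 1200 > /proc/sys/net/ipv4/route/min_pmtu
 *
 * The 512 + 20 + 20 default for ip_rt_min_pmtu is 512 bytes of payload
 * plus minimal IP and TCP headers.
 */
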
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
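
/* Consumption sketch (assuming the rt_tos2priority() helper declared in
 * include/net/route.h): the TOS bits are masked and shifted down by one
 * to index the 16 entries above, so each TC_PRIO_* class sits next to
 * its ECN_OR_COST() twin:
 *
 *   static inline char rt_tos2priority(u8 tos)
 *   {
 *           return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *   }
 */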

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;

        rt = (const struct rtable *)dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *)&rt->rt_gateway;
        else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
                return;

        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
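
/* Illustrative numbers (not from the code): if a bucket was last used
 * 1000 jiffies ago, delta is drawn uniformly from [0, 1000), so an
 * observer sampling the IP ID before and after an idle period cannot
 * subtract the two samples to count packets sent in between. A busy
 * bucket (old == now) gets delta == 0 and behaves like a plain
 * per-bucket counter.
 */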

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
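
/* Layout sketch (FNHE_HASH_SHIFT comes from <net/ip_fib.h>): each
 * nexthop lazily allocates one array of (1 << FNHE_HASH_SHIFT) buckets,
 * and fnhe_hashfun() selects the bucket for a destination, so the
 * exceptions for a given daddr chain off hash[fnhe_hashfun(daddr)].chain.
 */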

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nh->nh_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu)
                        fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so that anyone caching them rechecks whether this
                 * exception applies.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
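/* Worked example (illustrative, assuming the defaults above and HZ=100):
 * ip_rt_redirect_load is HZ/50 = 2 jiffies, so the k-th redirect to a
 * peer (k == rate_tokens) is sent no sooner than 2 << k jiffies after
 * the previous one -- roughly 20ms, 40ms, 80ms, ... until the 9th.
 * ip_rt_redirect_silence is (HZ/50) << 10 = 2048 jiffies (~20s); after
 * that long with no redirect sent, rate_tokens resets to zero and the
 * backoff cycle restarts.
 */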

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK, which forces all validation calls
         * down into this function.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

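/* Precedence implemented below: an unexpired per-route PMTU (rt_pmtu,
 * typically learned from an ICMP fragmentation-needed message) wins;
 * otherwise the RTAX_MTU metric; otherwise the device MTU, clamped to
 * 576 for locked-metric routes via a gateway, capped at IP_MAX_MTU and
 * reduced by any lwtunnel encapsulation headroom.
 */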
1260 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1261 {
1262         const struct rtable *rt = (const struct rtable *) dst;
1263         unsigned int mtu = rt->rt_pmtu;
1264
1265         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1266                 mtu = dst_metric_raw(dst, RTAX_MTU);
1267
1268         if (mtu)
1269                 return mtu;
1270
1271         mtu = READ_ONCE(dst->dev->mtu);
1272
1273         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1274                 if (rt->rt_uses_gateway && mtu > 576)
1275                         mtu = 576;
1276         }
1277
1278         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1279
1280         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1281 }
1282
1283 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1284 {
1285         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1286         struct fib_nh_exception *fnhe;
1287         u32 hval;
1288
1289         if (!hash)
1290                 return NULL;
1291
1292         hval = fnhe_hashfun(daddr);
1293
1294         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1295              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1296                 if (fnhe->fnhe_daddr == daddr)
1297                         return fnhe;
1298         }
1299         return NULL;
1300 }
1301
1302 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1303                               __be32 daddr, const bool do_cache)
1304 {
1305         bool ret = false;
1306
1307         spin_lock_bh(&fnhe_lock);
1308
1309         if (daddr == fnhe->fnhe_daddr) {
1310                 struct rtable __rcu **porig;
1311                 struct rtable *orig;
1312                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1313
1314                 if (rt_is_input_route(rt))
1315                         porig = &fnhe->fnhe_rth_input;
1316                 else
1317                         porig = &fnhe->fnhe_rth_output;
1318                 orig = rcu_dereference(*porig);
1319
1320                 if (fnhe->fnhe_genid != genid) {
1321                         fnhe->fnhe_genid = genid;
1322                         fnhe->fnhe_gw = 0;
1323                         fnhe->fnhe_pmtu = 0;
1324                         fnhe->fnhe_expires = 0;
1325                         fnhe_flush_routes(fnhe);
1326                         orig = NULL;
1327                 }
1328                 fill_route_from_fnhe(rt, fnhe);
1329                 if (!rt->rt_gateway)
1330                         rt->rt_gateway = daddr;
1331
1332                 if (do_cache) {
1333                         dst_hold(&rt->dst);
1334                         rcu_assign_pointer(*porig, rt);
1335                         if (orig) {
1336                                 dst_dev_put(&orig->dst);
1337                                 dst_release(&orig->dst);
1338                         }
1339                         ret = true;
1340                 }
1341
1342                 fnhe->fnhe_stamp = jiffies;
1343         }
1344         spin_unlock_bh(&fnhe_lock);
1345
1346         return ret;
1347 }
1348
1349 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1350 {
1351         struct rtable *orig, *prev, **p;
1352         bool ret = true;
1353
1354         if (rt_is_input_route(rt)) {
1355                 p = (struct rtable **)&nh->nh_rth_input;
1356         } else {
1357                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1358         }
1359         orig = *p;
1360
1361         /* hold dst before doing cmpxchg() to avoid race condition
1362          * on this dst
1363          */
1364         dst_hold(&rt->dst);
1365         prev = cmpxchg(p, orig, rt);
1366         if (prev == orig) {
1367                 if (orig) {
1368                         dst_dev_put(&orig->dst);
1369                         dst_release(&orig->dst);
1370                 }
1371         } else {
1372                 dst_release(&rt->dst);
1373                 ret = false;
1374         }
1375
1376         return ret;
1377 }
1378
1379 struct uncached_list {
1380         spinlock_t              lock;
1381         struct list_head        head;
1382 };
1383
1384 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1385
1386 static void rt_add_uncached_list(struct rtable *rt)
1387 {
1388         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1389
1390         rt->rt_uncached_list = ul;
1391
1392         spin_lock_bh(&ul->lock);
1393         list_add_tail(&rt->rt_uncached, &ul->head);
1394         spin_unlock_bh(&ul->lock);
1395 }
1396
1397 static void ipv4_dst_destroy(struct dst_entry *dst)
1398 {
1399         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1400         struct rtable *rt = (struct rtable *) dst;
1401
1402         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1403                 kfree(p);
1404
1405         if (!list_empty(&rt->rt_uncached)) {
1406                 struct uncached_list *ul = rt->rt_uncached_list;
1407
1408                 spin_lock_bh(&ul->lock);
1409                 list_del(&rt->rt_uncached);
1410                 spin_unlock_bh(&ul->lock);
1411         }
1412 }
1413
1414 void rt_flush_dev(struct net_device *dev)
1415 {
1416         struct net *net = dev_net(dev);
1417         struct rtable *rt;
1418         int cpu;
1419
1420         for_each_possible_cpu(cpu) {
1421                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1422
1423                 spin_lock_bh(&ul->lock);
1424                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1425                         if (rt->dst.dev != dev)
1426                                 continue;
1427                         rt->dst.dev = net->loopback_dev;
1428                         dev_hold(rt->dst.dev);
1429                         dev_put(dev);
1430                 }
1431                 spin_unlock_bh(&ul->lock);
1432         }
1433 }
1434
1435 static bool rt_cache_valid(const struct rtable *rt)
1436 {
1437         return  rt &&
1438                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1439                 !rt_is_expired(rt);
1440 }
1441
1442 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1443                            const struct fib_result *res,
1444                            struct fib_nh_exception *fnhe,
1445                            struct fib_info *fi, u16 type, u32 itag,
1446                            const bool do_cache)
1447 {
1448         bool cached = false;
1449
1450         if (fi) {
1451                 struct fib_nh *nh = &FIB_RES_NH(*res);
1452
1453                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1454                         rt->rt_gateway = nh->nh_gw;
1455                         rt->rt_uses_gateway = 1;
1456                 }
1457                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1458                 if (fi->fib_metrics != &dst_default_metrics) {
1459                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1460                         refcount_inc(&fi->fib_metrics->refcnt);
1461                 }
1462 #ifdef CONFIG_IP_ROUTE_CLASSID
1463                 rt->dst.tclassid = nh->nh_tclassid;
1464 #endif
1465                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1466                 if (unlikely(fnhe))
1467                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1468                 else if (do_cache)
1469                         cached = rt_cache_route(nh, rt);
1470                 if (unlikely(!cached)) {
1471                         /* Routes we intend to cache in nexthop exception or
1472                          * FIB nexthop have the DST_NOCACHE bit clear.
1473                          * However, if we are unsuccessful at storing this
1474                          * route into the cache we really need to set it.
1475                          */
1476                         if (!rt->rt_gateway)
1477                                 rt->rt_gateway = daddr;
1478                         rt_add_uncached_list(rt);
1479                 }
1480         } else
1481                 rt_add_uncached_list(rt);
1482
1483 #ifdef CONFIG_IP_ROUTE_CLASSID
1484 #ifdef CONFIG_IP_MULTIPLE_TABLES
1485         set_class_tag(rt, res->tclassid);
1486 #endif
1487         set_class_tag(rt, itag);
1488 #endif
1489 }
1490
1491 struct rtable *rt_dst_alloc(struct net_device *dev,
1492                             unsigned int flags, u16 type,
1493                             bool nopolicy, bool noxfrm, bool will_cache)
1494 {
1495         struct rtable *rt;
1496
1497         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1498                        (will_cache ? 0 : DST_HOST) |
1499                        (nopolicy ? DST_NOPOLICY : 0) |
1500                        (noxfrm ? DST_NOXFRM : 0));
1501
1502         if (rt) {
1503                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1504                 rt->rt_flags = flags;
1505                 rt->rt_type = type;
1506                 rt->rt_is_input = 0;
1507                 rt->rt_iif = 0;
1508                 rt->rt_pmtu = 0;
1509                 rt->rt_gateway = 0;
1510                 rt->rt_uses_gateway = 0;
1511                 rt->rt_table_id = 0;
1512                 INIT_LIST_HEAD(&rt->rt_uncached);
1513
1514                 rt->dst.output = ip_output;
1515                 if (flags & RTCF_LOCAL)
1516                         rt->dst.input = ip_local_deliver;
1517         }
1518
1519         return rt;
1520 }
1521 EXPORT_SYMBOL(rt_dst_alloc);
1522
1523 /* called in rcu_read_lock() section */
1524 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1525                           u8 tos, struct net_device *dev,
1526                           struct in_device *in_dev, u32 *itag)
1527 {
1528         int err;
1529
1530         /* Primary sanity checks. */
1531         if (!in_dev)
1532                 return -EINVAL;
1533
1534         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1535             skb->protocol != htons(ETH_P_IP))
1536                 return -EINVAL;
1537
1538         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1539                 return -EINVAL;
1540
1541         if (ipv4_is_zeronet(saddr)) {
1542                 if (!ipv4_is_local_multicast(daddr))
1543                         return -EINVAL;
1544         } else {
1545                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1546                                           in_dev, itag);
1547                 if (err < 0)
1548                         return err;
1549         }
1550         return 0;
1551 }
1552
1553 /* called in rcu_read_lock() section */
1554 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1555                              u8 tos, struct net_device *dev, int our)
1556 {
1557         struct in_device *in_dev = __in_dev_get_rcu(dev);
1558         unsigned int flags = RTCF_MULTICAST;
1559         struct rtable *rth;
1560         u32 itag = 0;
1561         int err;
1562
1563         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1564         if (err)
1565                 return err;
1566
1567         if (our)
1568                 flags |= RTCF_LOCAL;
1569
1570         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1571                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1572         if (!rth)
1573                 return -ENOBUFS;
1574
1575 #ifdef CONFIG_IP_ROUTE_CLASSID
1576         rth->dst.tclassid = itag;
1577 #endif
1578         rth->dst.output = ip_rt_bug;
1579         rth->rt_is_input = 1;
1580
1581 #ifdef CONFIG_IP_MROUTE
1582         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1583                 rth->dst.input = ip_mr_input;
1584 #endif
1585         RT_CACHE_STAT_INC(in_slow_mc);
1586
1587         skb_dst_set(skb, &rth->dst);
1588         return 0;
1589 }
1590
1591
1592 static void ip_handle_martian_source(struct net_device *dev,
1593                                      struct in_device *in_dev,
1594                                      struct sk_buff *skb,
1595                                      __be32 daddr,
1596                                      __be32 saddr)
1597 {
1598         RT_CACHE_STAT_INC(in_martian_src);
1599 #ifdef CONFIG_IP_ROUTE_VERBOSE
1600         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1601                 /*
1602                  *      RFC1812 recommendation: if the source is martian,
1603                  *      the only hint is the MAC header.
1604                  */
1605                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1606                         &daddr, &saddr, dev->name);
1607                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1608                         print_hex_dump(KERN_WARNING, "ll header: ",
1609                                        DUMP_PREFIX_OFFSET, 16, 1,
1610                                        skb_mac_header(skb),
1611                                        dev->hard_header_len, true);
1612                 }
1613         }
1614 #endif
1615 }
1616
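/* Editor's note: the loop below walks the hash chain through a
 * pointer-to-pointer (fnhe_p) so the matching entry can be unlinked with
 * a single rcu_assign_pointer(), while kfree_rcu() defers the free until
 * concurrent rcu_read_lock() readers of the chain have finished.
 */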
1617 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1618 {
1619         struct fnhe_hash_bucket *hash;
1620         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1621         u32 hval = fnhe_hashfun(daddr);
1622
1623         spin_lock_bh(&fnhe_lock);
1624
1625         hash = rcu_dereference_protected(nh->nh_exceptions,
1626                                          lockdep_is_held(&fnhe_lock));
1627         hash += hval;
1628
1629         fnhe_p = &hash->chain;
1630         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1631         while (fnhe) {
1632                 if (fnhe->fnhe_daddr == daddr) {
1633                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1634                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1635                         fnhe_flush_routes(fnhe);
1636                         kfree_rcu(fnhe, rcu);
1637                         break;
1638                 }
1639                 fnhe_p = &fnhe->fnhe_next;
1640                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1641                                                  lockdep_is_held(&fnhe_lock));
1642         }
1643
1644         spin_unlock_bh(&fnhe_lock);
1645 }
1646
1647 static void set_lwt_redirect(struct rtable *rth)
1648 {
1649         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1650                 rth->dst.lwtstate->orig_output = rth->dst.output;
1651                 rth->dst.output = lwtunnel_output;
1652         }
1653
1654         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1655                 rth->dst.lwtstate->orig_input = rth->dst.input;
1656                 rth->dst.input = lwtunnel_input;
1657         }
1658 }
1659
1660 /* called in rcu_read_lock() section */
1661 static int __mkroute_input(struct sk_buff *skb,
1662                            const struct fib_result *res,
1663                            struct in_device *in_dev,
1664                            __be32 daddr, __be32 saddr, u32 tos)
1665 {
1666         struct fib_nh_exception *fnhe;
1667         struct rtable *rth;
1668         int err;
1669         struct in_device *out_dev;
1670         bool do_cache;
1671         u32 itag = 0;
1672
1673         /* get a working reference to the output device */
1674         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1675         if (!out_dev) {
1676                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1677                 return -EINVAL;
1678         }
1679
1680         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1681                                   in_dev->dev, in_dev, &itag);
1682         if (err < 0) {
1683                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1684                                          saddr);
1685
1686                 goto cleanup;
1687         }
1688
1689         do_cache = res->fi && !itag;
1690         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1691             skb->protocol == htons(ETH_P_IP) &&
1692             (IN_DEV_SHARED_MEDIA(out_dev) ||
1693              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1694                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1695
1696         if (skb->protocol != htons(ETH_P_IP)) {
1697                 /* Not IP (i.e. ARP). Do not create a route if it is
1698                  * invalid for proxy arp. DNAT routes are always valid.
1699                  *
1700                  * The proxy arp feature has been extended to allow ARP
1701                  * replies back to the same interface, to support
1702                  * Private VLAN switch technologies. See arp.c.
1703                  */
1704                 if (out_dev == in_dev &&
1705                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1706                         err = -EINVAL;
1707                         goto cleanup;
1708                 }
1709         }
1710
1711         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1712         if (do_cache) {
1713                 if (fnhe) {
1714                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1715                         if (rth && rth->dst.expires &&
1716                             time_after(jiffies, rth->dst.expires)) {
1717                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1718                                 fnhe = NULL;
1719                         } else {
1720                                 goto rt_cache;
1721                         }
1722                 }
1723
1724                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1725
1726 rt_cache:
1727                 if (rt_cache_valid(rth)) {
1728                         skb_dst_set_noref(skb, &rth->dst);
1729                         goto out;
1730                 }
1731         }
1732
1733         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1734                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1735                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1736         if (!rth) {
1737                 err = -ENOBUFS;
1738                 goto cleanup;
1739         }
1740
1741         rth->rt_is_input = 1;
1742         if (res->table)
1743                 rth->rt_table_id = res->table->tb_id;
1744         RT_CACHE_STAT_INC(in_slow_tot);
1745
1746         rth->dst.input = ip_forward;
1747
1748         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1749                        do_cache);
1750         set_lwt_redirect(rth);
1751         skb_dst_set(skb, &rth->dst);
1752 out:
1753         err = 0;
1754  cleanup:
1755         return err;
1756 }
1757
1758 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1759 /* To make ICMP packets follow the right flow, the multipath hash is
1760  * calculated from the inner IP addresses.
1761  */
1762 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1763                                  struct flow_keys *hash_keys)
1764 {
1765         const struct iphdr *outer_iph = ip_hdr(skb);
1766         const struct iphdr *inner_iph;
1767         const struct icmphdr *icmph;
1768         struct iphdr _inner_iph;
1769         struct icmphdr _icmph;
1770
1771         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1772         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1773         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1774                 return;
1775
1776         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1777                 return;
1778
1779         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1780                                    &_icmph);
1781         if (!icmph)
1782                 return;
1783
1784         if (icmph->type != ICMP_DEST_UNREACH &&
1785             icmph->type != ICMP_REDIRECT &&
1786             icmph->type != ICMP_TIME_EXCEEDED &&
1787             icmph->type != ICMP_PARAMETERPROB)
1788                 return;
1789
1790         inner_iph = skb_header_pointer(skb,
1791                                        outer_iph->ihl * 4 + sizeof(_icmph),
1792                                        sizeof(_inner_iph), &_inner_iph);
1793         if (!inner_iph)
1794                 return;
1795         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1796         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1797 }
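
/* Editor's worked example: a router in the middle of an A -> B flow sends
 * back an ICMP_TIME_EXCEEDED whose payload embeds the original A -> B
 * header.  Hashing on that inner saddr/daddr pair, rather than the outer
 * router -> A pair, makes the error hash to the same nexthop as the flow
 * that triggered it -- which tools such as traceroute depend on.
 */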
1798
1799 /* if skb is set, it will be used and fl4 can be NULL */
1800 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1801                        const struct sk_buff *skb)
1802 {
1803         struct net *net = fi->fib_net;
1804         struct flow_keys hash_keys;
1805         u32 mhash;
1806
1807         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1808         case 0:
1809                 memset(&hash_keys, 0, sizeof(hash_keys));
1810                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1811                 if (skb) {
1812                         ip_multipath_l3_keys(skb, &hash_keys);
1813                 } else {
1814                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1815                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1816                 }
1817                 break;
1818         case 1:
1819                 /* skb is currently provided only when forwarding */
1820                 if (skb) {
1821                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1822                         struct flow_keys keys;
1823
1824                         /* short-circuit if we already have L4 hash present */
1825                         if (skb->l4_hash)
1826                                 return skb_get_hash_raw(skb) >> 1;
1827                         memset(&hash_keys, 0, sizeof(hash_keys));
1828                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1829
1830                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1831                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1832                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1833                         hash_keys.ports.src = keys.ports.src;
1834                         hash_keys.ports.dst = keys.ports.dst;
1835                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1836                 } else {
1837                         memset(&hash_keys, 0, sizeof(hash_keys));
1838                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1839                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1840                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1841                         hash_keys.ports.src = fl4->fl4_sport;
1842                         hash_keys.ports.dst = fl4->fl4_dport;
1843                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1844                 }
1845                 break;
1846         }
1847         mhash = flow_hash_from_keys(&hash_keys);
1848
1849         return mhash >> 1;
1850 }
1851 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1852 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
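
/* Editor's note: the policy switch above is driven by the per-netns
 * sysctl net.ipv4.fib_multipath_hash_policy; illustrative usage:
 *
 *	sysctl net.ipv4.fib_multipath_hash_policy=0   # L3: addresses only
 *	sysctl net.ipv4.fib_multipath_hash_policy=1   # L4: 5-tuple ("case 1")
 */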
1853
1854 static int ip_mkroute_input(struct sk_buff *skb,
1855                             struct fib_result *res,
1856                             struct in_device *in_dev,
1857                             __be32 daddr, __be32 saddr, u32 tos)
1858 {
1859 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1860         if (res->fi && res->fi->fib_nhs > 1) {
1861                 int h = fib_multipath_hash(res->fi, NULL, skb);
1862
1863                 fib_select_multipath(res, h);
1864         }
1865 #endif
1866
1867         /* create a routing cache entry */
1868         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1869 }
1870
1871 /*
1872  *      NOTE. We drop all packets that have local source
1873  *      addresses, because every properly looped-back packet
1874  *      must already have the correct destination attached by the output routine.
1875  *
1876  *      This approach solves two big problems:
1877  *      1. Non-simplex devices are handled properly.
1878  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1879  *      Called with rcu_read_lock().
1880  */
1881
1882 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1883                                u8 tos, struct net_device *dev,
1884                                struct fib_result *res)
1885 {
1886         struct in_device *in_dev = __in_dev_get_rcu(dev);
1887         struct ip_tunnel_info *tun_info;
1888         struct flowi4   fl4;
1889         unsigned int    flags = 0;
1890         u32             itag = 0;
1891         struct rtable   *rth;
1892         int             err = -EINVAL;
1893         struct net    *net = dev_net(dev);
1894         bool do_cache;
1895
1896         /* IP on this device is disabled. */
1897
1898         if (!in_dev)
1899                 goto out;
1900
1901         /* Check for the most weird martians, which cannot be detected
1902            by fib_lookup.
1903          */
1904
1905         tun_info = skb_tunnel_info(skb);
1906         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1907                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1908         else
1909                 fl4.flowi4_tun_key.tun_id = 0;
1910         skb_dst_drop(skb);
1911
1912         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1913                 goto martian_source;
1914
1915         res->fi = NULL;
1916         res->table = NULL;
1917         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1918                 goto brd_input;
1919
1920         /* Accept zero addresses only for limited broadcast;
1921          * I do not even know whether to fix this or not. Waiting for complaints :-)
1922          */
1923         if (ipv4_is_zeronet(saddr))
1924                 goto martian_source;
1925
1926         if (ipv4_is_zeronet(daddr))
1927                 goto martian_destination;
1928
1929         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1930          * and calls it at most once when daddr and/or saddr are loopback addresses.
1931          */
1932         if (ipv4_is_loopback(daddr)) {
1933                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1934                         goto martian_destination;
1935         } else if (ipv4_is_loopback(saddr)) {
1936                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1937                         goto martian_source;
1938         }
1939
1940         /*
1941          *      Now we are ready to route the packet.
1942          */
1943         fl4.flowi4_oif = 0;
1944         fl4.flowi4_iif = dev->ifindex;
1945         fl4.flowi4_mark = skb->mark;
1946         fl4.flowi4_tos = tos;
1947         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1948         fl4.flowi4_flags = 0;
1949         fl4.daddr = daddr;
1950         fl4.saddr = saddr;
1951         fl4.flowi4_uid = sock_net_uid(net, NULL);
1952         err = fib_lookup(net, &fl4, res, 0);
1953         if (err != 0) {
1954                 if (!IN_DEV_FORWARD(in_dev))
1955                         err = -EHOSTUNREACH;
1956                 goto no_route;
1957         }
1958
1959         if (res->type == RTN_BROADCAST)
1960                 goto brd_input;
1961
1962         if (res->type == RTN_LOCAL) {
1963                 err = fib_validate_source(skb, saddr, daddr, tos,
1964                                           0, dev, in_dev, &itag);
1965                 if (err < 0)
1966                         goto martian_source;
1967                 goto local_input;
1968         }
1969
1970         if (!IN_DEV_FORWARD(in_dev)) {
1971                 err = -EHOSTUNREACH;
1972                 goto no_route;
1973         }
1974         if (res->type != RTN_UNICAST)
1975                 goto martian_destination;
1976
1977         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1978 out:    return err;
1979
1980 brd_input:
1981         if (skb->protocol != htons(ETH_P_IP))
1982                 goto e_inval;
1983
1984         if (!ipv4_is_zeronet(saddr)) {
1985                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1986                                           in_dev, &itag);
1987                 if (err < 0)
1988                         goto martian_source;
1989         }
1990         flags |= RTCF_BROADCAST;
1991         res->type = RTN_BROADCAST;
1992         RT_CACHE_STAT_INC(in_brd);
1993
1994 local_input:
1995         do_cache = false;
1996         if (res->fi) {
1997                 if (!itag) {
1998                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1999                         if (rt_cache_valid(rth)) {
2000                                 skb_dst_set_noref(skb, &rth->dst);
2001                                 err = 0;
2002                                 goto out;
2003                         }
2004                         do_cache = true;
2005                 }
2006         }
2007
2008         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2009                            flags | RTCF_LOCAL, res->type,
2010                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2011         if (!rth)
2012                 goto e_nobufs;
2013
2014         rth->dst.output = ip_rt_bug;
2015 #ifdef CONFIG_IP_ROUTE_CLASSID
2016         rth->dst.tclassid = itag;
2017 #endif
2018         rth->rt_is_input = 1;
2019         if (res->table)
2020                 rth->rt_table_id = res->table->tb_id;
2021
2022         RT_CACHE_STAT_INC(in_slow_tot);
2023         if (res->type == RTN_UNREACHABLE) {
2024                 rth->dst.input = ip_error;
2025                 rth->dst.error = -err;
2026                 rth->rt_flags &= ~RTCF_LOCAL;
2027         }
2028
2029         if (do_cache) {
2030                 struct fib_nh *nh = &FIB_RES_NH(*res);
2031
2032                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2033                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2034                         WARN_ON(rth->dst.input == lwtunnel_input);
2035                         rth->dst.lwtstate->orig_input = rth->dst.input;
2036                         rth->dst.input = lwtunnel_input;
2037                 }
2038
2039                 if (unlikely(!rt_cache_route(nh, rth)))
2040                         rt_add_uncached_list(rth);
2041         }
2042         skb_dst_set(skb, &rth->dst);
2043         err = 0;
2044         goto out;
2045
2046 no_route:
2047         RT_CACHE_STAT_INC(in_no_route);
2048         res->type = RTN_UNREACHABLE;
2049         res->fi = NULL;
2050         res->table = NULL;
2051         goto local_input;
2052
2053         /*
2054          *      Do not cache martian addresses: they should be logged (RFC1812)
2055          */
2056 martian_destination:
2057         RT_CACHE_STAT_INC(in_martian_dst);
2058 #ifdef CONFIG_IP_ROUTE_VERBOSE
2059         if (IN_DEV_LOG_MARTIANS(in_dev))
2060                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2061                                      &daddr, &saddr, dev->name);
2062 #endif
2063
2064 e_inval:
2065         err = -EINVAL;
2066         goto out;
2067
2068 e_nobufs:
2069         err = -ENOBUFS;
2070         goto out;
2071
2072 martian_source:
2073         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2074         goto out;
2075 }
2076
2077 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2078                          u8 tos, struct net_device *dev)
2079 {
2080         struct fib_result res;
2081         int err;
2082
2083         tos &= IPTOS_RT_MASK;
2084         rcu_read_lock();
2085         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2086         rcu_read_unlock();
2087
2088         return err;
2089 }
2090 EXPORT_SYMBOL(ip_route_input_noref);
2091
2092 /* called with rcu_read_lock held */
2093 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094                        u8 tos, struct net_device *dev, struct fib_result *res)
2095 {
2096         /* Multicast recognition logic was moved from the route cache to here.
2097            The problem was that too many Ethernet cards have broken/missing
2098            hardware multicast filters :-( As a result, a host on a multicast
2099            network acquires a lot of useless route cache entries, e.g. for
2100            SDR messages from all over the world. Now we try to get rid of them.
2101            Provided the software IP multicast filter is organized
2102            reasonably (at least, hashed), this does not result in a slowdown
2103            compared with route cache reject entries.
2104            Note that multicast routers are not affected, because
2105            a route cache entry is created eventually.
2106          */
2107         if (ipv4_is_multicast(daddr)) {
2108                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2109                 int our = 0;
2110                 int err = -EINVAL;
2111
2112                 if (in_dev)
2113                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2114                                               ip_hdr(skb)->protocol);
2115
2116                 /* check l3 master if no match yet */
2117                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2118                         struct in_device *l3_in_dev;
2119
2120                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2121                         if (l3_in_dev)
2122                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2123                                                       ip_hdr(skb)->protocol);
2124                 }
2125
2126                 if (our
2127 #ifdef CONFIG_IP_MROUTE
2128                         ||
2129                     (!ipv4_is_local_multicast(daddr) &&
2130                      IN_DEV_MFORWARD(in_dev))
2131 #endif
2132                    ) {
2133                         err = ip_route_input_mc(skb, daddr, saddr,
2134                                                 tos, dev, our);
2135                 }
2136                 return err;
2137         }
2138
2139         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2140 }
2141
2142 /* called with rcu_read_lock() */
2143 static struct rtable *__mkroute_output(const struct fib_result *res,
2144                                        const struct flowi4 *fl4, int orig_oif,
2145                                        struct net_device *dev_out,
2146                                        unsigned int flags)
2147 {
2148         struct fib_info *fi = res->fi;
2149         struct fib_nh_exception *fnhe;
2150         struct in_device *in_dev;
2151         u16 type = res->type;
2152         struct rtable *rth;
2153         bool do_cache;
2154
2155         in_dev = __in_dev_get_rcu(dev_out);
2156         if (!in_dev)
2157                 return ERR_PTR(-EINVAL);
2158
2159         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2160                 if (ipv4_is_loopback(fl4->saddr) &&
2161                     !(dev_out->flags & IFF_LOOPBACK) &&
2162                     !netif_is_l3_master(dev_out))
2163                         return ERR_PTR(-EINVAL);
2164
2165         if (ipv4_is_lbcast(fl4->daddr))
2166                 type = RTN_BROADCAST;
2167         else if (ipv4_is_multicast(fl4->daddr))
2168                 type = RTN_MULTICAST;
2169         else if (ipv4_is_zeronet(fl4->daddr))
2170                 return ERR_PTR(-EINVAL);
2171
2172         if (dev_out->flags & IFF_LOOPBACK)
2173                 flags |= RTCF_LOCAL;
2174
2175         do_cache = true;
2176         if (type == RTN_BROADCAST) {
2177                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2178                 fi = NULL;
2179         } else if (type == RTN_MULTICAST) {
2180                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2181                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2182                                      fl4->flowi4_proto))
2183                         flags &= ~RTCF_LOCAL;
2184                 else
2185                         do_cache = false;
2186                 /* If the multicast route does not exist, use
2187                  * the default one, but do not use a gateway in this case.
2188                  * Yes, it is a hack.
2189                  */
2190                 if (fi && res->prefixlen < 4)
2191                         fi = NULL;
2192         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2193                    (orig_oif != dev_out->ifindex)) {
2194                 /* For local routes that require a particular output interface
2195                  * we do not want to cache the result.  Caching the result
2196                  * causes incorrect behaviour when there are multiple source
2197                  * addresses on the interface: if the intended recipient is
2198                  * waiting on that interface for the packet, it won't receive
2199                  * it, because the packet will be delivered on the loopback
2200                  * interface and the IP_PKTINFO ipi_ifindex will be set to
2201                  * the loopback interface as well.
2202                  */
2203                 fi = NULL;
2204         }
2205
2206         fnhe = NULL;
2207         do_cache &= fi != NULL;
2208         if (do_cache) {
2209                 struct rtable __rcu **prth;
2210                 struct fib_nh *nh = &FIB_RES_NH(*res);
2211
2212                 fnhe = find_exception(nh, fl4->daddr);
2213                 if (fnhe) {
2214                         prth = &fnhe->fnhe_rth_output;
2215                         rth = rcu_dereference(*prth);
2216                         if (rth && rth->dst.expires &&
2217                             time_after(jiffies, rth->dst.expires)) {
2218                                 ip_del_fnhe(nh, fl4->daddr);
2219                                 fnhe = NULL;
2220                         } else {
2221                                 goto rt_cache;
2222                         }
2223                 }
2224
2225                 if (unlikely(fl4->flowi4_flags &
2226                              FLOWI_FLAG_KNOWN_NH &&
2227                              !(nh->nh_gw &&
2228                                nh->nh_scope == RT_SCOPE_LINK))) {
2229                         do_cache = false;
2230                         goto add;
2231                 }
2232                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2233                 rth = rcu_dereference(*prth);
2234
2235 rt_cache:
2236                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2237                         return rth;
2238         }
2239
2240 add:
2241         rth = rt_dst_alloc(dev_out, flags, type,
2242                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2243                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2244                            do_cache);
2245         if (!rth)
2246                 return ERR_PTR(-ENOBUFS);
2247
2248         rth->rt_iif = orig_oif;
2249         if (res->table)
2250                 rth->rt_table_id = res->table->tb_id;
2251
2252         RT_CACHE_STAT_INC(out_slow_tot);
2253
2254         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2255                 if (flags & RTCF_LOCAL &&
2256                     !(dev_out->flags & IFF_LOOPBACK)) {
2257                         rth->dst.output = ip_mc_output;
2258                         RT_CACHE_STAT_INC(out_slow_mc);
2259                 }
2260 #ifdef CONFIG_IP_MROUTE
2261                 if (type == RTN_MULTICAST) {
2262                         if (IN_DEV_MFORWARD(in_dev) &&
2263                             !ipv4_is_local_multicast(fl4->daddr)) {
2264                                 rth->dst.input = ip_mr_input;
2265                                 rth->dst.output = ip_mc_output;
2266                         }
2267                 }
2268 #endif
2269         }
2270
2271         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2272         set_lwt_redirect(rth);
2273
2274         return rth;
2275 }
2276
2277 /*
2278  * Major route resolver routine.
2279  */
2280
2281 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2282                                         const struct sk_buff *skb)
2283 {
2284         __u8 tos = RT_FL_TOS(fl4);
2285         struct fib_result res;
2286         struct rtable *rth;
2287
2288         res.tclassid    = 0;
2289         res.fi          = NULL;
2290         res.table       = NULL;
2291
2292         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2293         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2294         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2295                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2296
2297         rcu_read_lock();
2298         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2299         rcu_read_unlock();
2300
2301         return rth;
2302 }
2303 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2304
2305 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2306                                             struct fib_result *res,
2307                                             const struct sk_buff *skb)
2308 {
2309         struct net_device *dev_out = NULL;
2310         int orig_oif = fl4->flowi4_oif;
2311         unsigned int flags = 0;
2312         struct rtable *rth;
2313         int err = -ENETUNREACH;
2314
2315         if (fl4->saddr) {
2316                 rth = ERR_PTR(-EINVAL);
2317                 if (ipv4_is_multicast(fl4->saddr) ||
2318                     ipv4_is_lbcast(fl4->saddr) ||
2319                     ipv4_is_zeronet(fl4->saddr))
2320                         goto out;
2321
2322                 /* I removed the check for oif == dev_out->oif here.
2323                    It was wrong for two reasons:
2324                    1. ip_dev_find(net, saddr) can return the wrong iface, if
2325                       saddr is assigned to multiple interfaces.
2326                    2. Moreover, we are allowed to send packets with a saddr
2327                       of another iface. --ANK
2328                  */
2329
2330                 if (fl4->flowi4_oif == 0 &&
2331                     (ipv4_is_multicast(fl4->daddr) ||
2332                      ipv4_is_lbcast(fl4->daddr))) {
2333                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2334                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2335                         if (!dev_out)
2336                                 goto out;
2337
2338                         /* Special hack: the user can direct multicasts
2339                            and limited broadcast via the necessary interface
2340                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2341                            This hack is not just for fun, it allows
2342                            vic, vat and friends to work.
2343                            They bind a socket to loopback, set ttl to zero
2344                            and expect that it will work.
2345                            From the viewpoint of the routing cache they are broken,
2346                            because we are not allowed to build a multicast path
2347                            with a loopback source addr (look, the routing cache
2348                            cannot know that ttl is zero, so the packet
2349                            will not leave this host and the route is valid).
2350                            Luckily, this hack is a good workaround.
2351                          */
2352
2353                         fl4->flowi4_oif = dev_out->ifindex;
2354                         goto make_route;
2355                 }
2356
2357                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2358                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2359                         if (!__ip_dev_find(net, fl4->saddr, false))
2360                                 goto out;
2361                 }
2362         }
2363
2364
2365         if (fl4->flowi4_oif) {
2366                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2367                 rth = ERR_PTR(-ENODEV);
2368                 if (!dev_out)
2369                         goto out;
2370
2371                 /* RACE: Check return value of inet_select_addr instead. */
2372                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2373                         rth = ERR_PTR(-ENETUNREACH);
2374                         goto out;
2375                 }
2376                 if (ipv4_is_local_multicast(fl4->daddr) ||
2377                     ipv4_is_lbcast(fl4->daddr) ||
2378                     fl4->flowi4_proto == IPPROTO_IGMP) {
2379                         if (!fl4->saddr)
2380                                 fl4->saddr = inet_select_addr(dev_out, 0,
2381                                                               RT_SCOPE_LINK);
2382                         goto make_route;
2383                 }
2384                 if (!fl4->saddr) {
2385                         if (ipv4_is_multicast(fl4->daddr))
2386                                 fl4->saddr = inet_select_addr(dev_out, 0,
2387                                                               fl4->flowi4_scope);
2388                         else if (!fl4->daddr)
2389                                 fl4->saddr = inet_select_addr(dev_out, 0,
2390                                                               RT_SCOPE_HOST);
2391                 }
2392         }
2393
2394         if (!fl4->daddr) {
2395                 fl4->daddr = fl4->saddr;
2396                 if (!fl4->daddr)
2397                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2398                 dev_out = net->loopback_dev;
2399                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2400                 res->type = RTN_LOCAL;
2401                 flags |= RTCF_LOCAL;
2402                 goto make_route;
2403         }
2404
2405         err = fib_lookup(net, fl4, res, 0);
2406         if (err) {
2407                 res->fi = NULL;
2408                 res->table = NULL;
2409                 if (fl4->flowi4_oif &&
2410                     (ipv4_is_multicast(fl4->daddr) ||
2411                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2412                         /* Apparently, the routing tables are wrong. Assume
2413                            that the destination is on-link.
2414
2415                            WHY? DW.
2416                            Because we are allowed to send to an iface
2417                            even if it has NO routes and NO assigned
2418                            addresses. When oif is specified, the routing
2419                            tables are looked up with only one purpose:
2420                            to catch whether the destination is gatewayed
2421                            rather than direct. Moreover, if MSG_DONTROUTE
2422                            is set, we send the packet, ignoring both the
2423                            routing tables and the ifaddr state. --ANK
2424
2425
2426                            We could do this even if oif is unknown,
2427                            likely as IPv6 does, but we do not.
2428                          */
2429
2430                         if (fl4->saddr == 0)
2431                                 fl4->saddr = inet_select_addr(dev_out, 0,
2432                                                               RT_SCOPE_LINK);
2433                         res->type = RTN_UNICAST;
2434                         goto make_route;
2435                 }
2436                 rth = ERR_PTR(err);
2437                 goto out;
2438         }
2439
2440         if (res->type == RTN_LOCAL) {
2441                 if (!fl4->saddr) {
2442                         if (res->fi->fib_prefsrc)
2443                                 fl4->saddr = res->fi->fib_prefsrc;
2444                         else
2445                                 fl4->saddr = fl4->daddr;
2446                 }
2447
2448                 /* L3 master device is the loopback for that domain */
2449                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2450                         net->loopback_dev;
2451
2452                 /* make sure orig_oif points to fib result device even
2453                  * though packet rx/tx happens over loopback or l3mdev
2454                  */
2455                 orig_oif = FIB_RES_OIF(*res);
2456
2457                 fl4->flowi4_oif = dev_out->ifindex;
2458                 flags |= RTCF_LOCAL;
2459                 goto make_route;
2460         }
2461
2462         fib_select_path(net, res, fl4, skb);
2463
2464         dev_out = FIB_RES_DEV(*res);
2465         fl4->flowi4_oif = dev_out->ifindex;
2466
2467
2468 make_route:
2469         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2470
2471 out:
2472         return rth;
2473 }
2474
2475 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2476 {
2477         return NULL;
2478 }
2479
2480 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2481 {
2482         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2483
2484         return mtu ? : dst->dev->mtu;
2485 }
2486
2487 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2488                                           struct sk_buff *skb, u32 mtu)
2489 {
2490 }
2491
2492 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2493                                        struct sk_buff *skb)
2494 {
2495 }
2496
2497 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2498                                           unsigned long old)
2499 {
2500         return NULL;
2501 }
2502
2503 static struct dst_ops ipv4_dst_blackhole_ops = {
2504         .family                 =       AF_INET,
2505         .check                  =       ipv4_blackhole_dst_check,
2506         .mtu                    =       ipv4_blackhole_mtu,
2507         .default_advmss         =       ipv4_default_advmss,
2508         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2509         .redirect               =       ipv4_rt_blackhole_redirect,
2510         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2511         .neigh_lookup           =       ipv4_neigh_lookup,
2512 };
2513
2514 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2515 {
2516         struct rtable *ort = (struct rtable *) dst_orig;
2517         struct rtable *rt;
2518
2519         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2520         if (rt) {
2521                 struct dst_entry *new = &rt->dst;
2522
2523                 new->__use = 1;
2524                 new->input = dst_discard;
2525                 new->output = dst_discard_out;
2526
2527                 new->dev = net->loopback_dev;
2528                 if (new->dev)
2529                         dev_hold(new->dev);
2530
2531                 rt->rt_is_input = ort->rt_is_input;
2532                 rt->rt_iif = ort->rt_iif;
2533                 rt->rt_pmtu = ort->rt_pmtu;
2534
2535                 rt->rt_genid = rt_genid_ipv4(net);
2536                 rt->rt_flags = ort->rt_flags;
2537                 rt->rt_type = ort->rt_type;
2538                 rt->rt_gateway = ort->rt_gateway;
2539                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2540
2541                 INIT_LIST_HEAD(&rt->rt_uncached);
2542         }
2543
2544         dst_release(dst_orig);
2545
2546         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2547 }
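
/* Editor's note: the copy above keeps the addressing fields of the
 * original route but points input/output at dst_discard*, so every packet
 * using this dst is silently dropped; the xfrm code uses such blackhole
 * routes to stall traffic, e.g. while security associations are still
 * being resolved.
 */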
2548
2549 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2550                                     const struct sock *sk)
2551 {
2552         struct rtable *rt = __ip_route_output_key(net, flp4);
2553
2554         if (IS_ERR(rt))
2555                 return rt;
2556
2557         if (flp4->flowi4_proto)
2558                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2559                                                         flowi4_to_flowi(flp4),
2560                                                         sk, 0);
2561
2562         return rt;
2563 }
2564 EXPORT_SYMBOL_GPL(ip_route_output_flow);
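
/* Editor's sketch (hypothetical caller, not part of this file): a typical
 * in-kernel output lookup fills a flowi4, checks the result, and releases
 * the route with ip_rt_put() when done:
 *
 *	struct flowi4 fl4 = {
 *		.daddr        = daddr,		// destination, assumed given
 *		.flowi4_proto = IPPROTO_UDP,
 *	};
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */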
2565
2566 /* called with rcu_read_lock held */
2567 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2568                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2569                         u32 seq)
2570 {
2571         struct rtable *rt = skb_rtable(skb);
2572         struct rtmsg *r;
2573         struct nlmsghdr *nlh;
2574         unsigned long expires = 0;
2575         u32 error;
2576         u32 metrics[RTAX_MAX];
2577
2578         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2579         if (!nlh)
2580                 return -EMSGSIZE;
2581
2582         r = nlmsg_data(nlh);
2583         r->rtm_family    = AF_INET;
2584         r->rtm_dst_len  = 32;
2585         r->rtm_src_len  = 0;
2586         r->rtm_tos      = fl4->flowi4_tos;
2587         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2588         if (nla_put_u32(skb, RTA_TABLE, table_id))
2589                 goto nla_put_failure;
2590         r->rtm_type     = rt->rt_type;
2591         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2592         r->rtm_protocol = RTPROT_UNSPEC;
2593         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2594         if (rt->rt_flags & RTCF_NOTIFY)
2595                 r->rtm_flags |= RTM_F_NOTIFY;
2596         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2597                 r->rtm_flags |= RTCF_DOREDIRECT;
2598
2599         if (nla_put_in_addr(skb, RTA_DST, dst))
2600                 goto nla_put_failure;
2601         if (src) {
2602                 r->rtm_src_len = 32;
2603                 if (nla_put_in_addr(skb, RTA_SRC, src))
2604                         goto nla_put_failure;
2605         }
2606         if (rt->dst.dev &&
2607             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2608                 goto nla_put_failure;
2609 #ifdef CONFIG_IP_ROUTE_CLASSID
2610         if (rt->dst.tclassid &&
2611             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2612                 goto nla_put_failure;
2613 #endif
2614         if (!rt_is_input_route(rt) &&
2615             fl4->saddr != src) {
2616                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2617                         goto nla_put_failure;
2618         }
2619         if (rt->rt_uses_gateway &&
2620             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2621                 goto nla_put_failure;
2622
2623         expires = rt->dst.expires;
2624         if (expires) {
2625                 unsigned long now = jiffies;
2626
2627                 if (time_before(now, expires))
2628                         expires -= now;
2629                 else
2630                         expires = 0;
2631         }
2632
2633         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2634         if (rt->rt_pmtu && expires)
2635                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2636         if (rtnetlink_put_metrics(skb, metrics) < 0)
2637                 goto nla_put_failure;
2638
2639         if (fl4->flowi4_mark &&
2640             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2641                 goto nla_put_failure;
2642
2643         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2644             nla_put_u32(skb, RTA_UID,
2645                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2646                 goto nla_put_failure;
2647
2648         error = rt->dst.error;
2649
2650         if (rt_is_input_route(rt)) {
2651 #ifdef CONFIG_IP_MROUTE
2652                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2653                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2654                         int err = ipmr_get_route(net, skb,
2655                                                  fl4->saddr, fl4->daddr,
2656                                                  r, portid);
2657
2658                         if (err <= 0) {
2659                                 if (err == 0)
2660                                         return 0;
2661                                 goto nla_put_failure;
2662                         }
2663                 } else
2664 #endif
2665                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2666                                 goto nla_put_failure;
2667         }
2668
2669         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2670                 goto nla_put_failure;
2671
2672         nlmsg_end(skb, nlh);
2673         return 0;
2674
2675 nla_put_failure:
2676         nlmsg_cancel(skb, nlh);
2677         return -EMSGSIZE;
2678 }
2679
2680 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2681                              struct netlink_ext_ack *extack)
2682 {
2683         struct net *net = sock_net(in_skb->sk);
2684         struct rtmsg *rtm;
2685         struct nlattr *tb[RTA_MAX+1];
2686         struct fib_result res = {};
2687         struct rtable *rt = NULL;
2688         struct flowi4 fl4;
2689         __be32 dst = 0;
2690         __be32 src = 0;
2691         u32 iif;
2692         int err;
2693         int mark;
2694         struct sk_buff *skb;
2695         u32 table_id = RT_TABLE_MAIN;
2696         kuid_t uid;
2697
2698         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2699                           extack);
2700         if (err < 0)
2701                 goto errout;
2702
2703         rtm = nlmsg_data(nlh);
2704
2705         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2706         if (!skb) {
2707                 err = -ENOBUFS;
2708                 goto errout;
2709         }
2710
2711         /* Reserve room for dummy headers; this skb can pass
2712            through a good chunk of the routing engine.
2713          */
2714         skb_reset_mac_header(skb);
2715         skb_reset_network_header(skb);
2716
2717         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2718         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2719         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2720         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2721         if (tb[RTA_UID])
2722                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2723         else
2724                 uid = (iif ? INVALID_UID : current_uid());
2725
2726         /* Bugfix: need to give ip_route_input enough of an IP header to
2727          * not gag.
2728          */
2729         ip_hdr(skb)->protocol = IPPROTO_UDP;
2730         ip_hdr(skb)->saddr = src;
2731         ip_hdr(skb)->daddr = dst;
2732
2733         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2734
2735         memset(&fl4, 0, sizeof(fl4));
2736         fl4.daddr = dst;
2737         fl4.saddr = src;
2738         fl4.flowi4_tos = rtm->rtm_tos;
2739         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2740         fl4.flowi4_mark = mark;
2741         fl4.flowi4_uid = uid;
2742
2743         rcu_read_lock();
2744
2745         if (iif) {
2746                 struct net_device *dev;
2747
2748                 dev = dev_get_by_index_rcu(net, iif);
2749                 if (!dev) {
2750                         err = -ENODEV;
2751                         goto errout_free;
2752                 }
2753
2754                 skb->protocol   = htons(ETH_P_IP);
2755                 skb->dev        = dev;
2756                 skb->mark       = mark;
2757                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2758                                          dev, &res);
2759
2760                 rt = skb_rtable(skb);
2761                 if (err == 0 && rt->dst.error)
2762                         err = -rt->dst.error;
2763         } else {
2764                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2765                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2766                 err = 0;
2767                 if (IS_ERR(rt))
2768                         err = PTR_ERR(rt);
2769                 else
2770                         skb_dst_set(skb, &rt->dst);
2771         }
2772
2773         if (err)
2774                 goto errout_free;
2775
2776         if (rtm->rtm_flags & RTM_F_NOTIFY)
2777                 rt->rt_flags |= RTCF_NOTIFY;
2778
2779         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2780                 table_id = rt->rt_table_id;
2781
2782         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2783                 if (!res.fi) {
2784                         err = fib_props[res.type].error;
2785                         if (!err)
2786                                 err = -EHOSTUNREACH;
2787                         goto errout_free;
2788                 }
2789                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2790                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2791                                     rt->rt_type, res.prefix, res.prefixlen,
2792                                     fl4.flowi4_tos, res.fi, 0);
2793         } else {
2794                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2795                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2796         }
2797         if (err < 0)
2798                 goto errout_free;
2799
2800         rcu_read_unlock();
2801
2802         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2803 errout:
2804         return err;
2805
2806 errout_free:
2807         rcu_read_unlock();
2808         kfree_skb(skb);
2809         goto errout;
2810 }
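
/* Editor's note: this handler backs "ip route get"; illustrative
 * invocations and the paths above that they exercise:
 *
 *	ip route get 192.0.2.1                        # output lookup
 *	ip route get 192.0.2.1 iif eth0 from 198.51.100.7
 *	                                              # input path, as if received
 *	ip route get fibmatch 192.0.2.1               # RTM_F_FIB_MATCH: report
 *	                                              # the matched FIB entry
 */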
2811
2812 void ip_rt_multicast_event(struct in_device *in_dev)
2813 {
2814         rt_cache_flush(dev_net(in_dev->dev));
2815 }
2816
2817 #ifdef CONFIG_SYSCTL
2818 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2819 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2820 static int ip_rt_gc_elasticity __read_mostly    = 8;
2821
2822 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2823                                         void __user *buffer,
2824                                         size_t *lenp, loff_t *ppos)
2825 {
2826         struct net *net = (struct net *)__ctl->extra1;
2827
2828         if (write) {
2829                 rt_cache_flush(net);
2830                 fnhe_genid_bump(net);
2831                 return 0;
2832         }
2833
2834         return -EINVAL;
2835 }
2836
2837 static struct ctl_table ipv4_route_table[] = {
2838         {
2839                 .procname       = "gc_thresh",
2840                 .data           = &ipv4_dst_ops.gc_thresh,
2841                 .maxlen         = sizeof(int),
2842                 .mode           = 0644,
2843                 .proc_handler   = proc_dointvec,
2844         },
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
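
/*
 * Each entry above binds one file under /proc/sys/net/ipv4/route to an
 * ip_rt_* global.  The proc_handler picks the unit conversion:
 * proc_dointvec moves the raw integer, while the _jiffies and _ms_jiffies
 * variants translate between kernel jiffies and userspace seconds or
 * milliseconds.  A minimal sketch of a range-checked knob follows;
 * "example_knob", example_value and the bounds are hypothetical and not
 * part of this file:
 */
#if 0
static int example_value;
static int example_min;		/* lower bound, referenced by extra1 */
static int example_max = 255;	/* upper bound, referenced by extra2 */

static struct ctl_table example_table[] = {
	{
		.procname	= "example_knob",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		/* proc_dointvec_minmax rejects writes outside [extra1, extra2] */
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_min,
		.extra2		= &example_max,
	},
	{ }	/* sentinel terminates the table */
};
#endif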

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ }
};
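
/*
 * The flush entry is write-only (mode 0200) and has no .data pointer:
 * writing any integer, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * lands in ipv4_sysctl_rtcache_flush(), which takes the owning namespace
 * from ->extra1 (set up in sysctl_route_net_init() below) and invalidates
 * that namespace's cached routes.
 */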

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
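
/*
 * The initial namespace registers the static ipv4_route_flush_table
 * directly; every other namespace gets its own kmemdup() copy so that
 * tbl[0].extra1 can point at that namespace's struct net, and so that the
 * entry can be hidden (procname = NULL) from namespaces created by
 * unprivileged users.  On failure only a copy is freed -- the shared
 * static table is never kfree()d, which the exit path below asserts with
 * BUG_ON().
 */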

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
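
/*
 * rt_genid is a per-namespace generation counter: a cached dst records the
 * value current when it was created, and a later mismatch marks the dst
 * obsolete.  Flushing therefore never walks a list; rt_cache_flush() just
 * bumps the counter, roughly:
 *
 *	static inline void rt_genid_bump_ipv4(struct net *net)
 *	{
 *		atomic_inc(&net->ipv4.rt_genid);
 *	}
 *
 * fnhe_genid plays the same role for per-destination exception entries,
 * and dev_addr_genid is seeded from get_random_int() so its values differ
 * from boot to boot.
 */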

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
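
/*
 * The inetpeer base holds long-lived, per-remote-address state (ICMP rate
 * limiting, among other users).  It is strictly per namespace: init
 * allocates an empty base, and exit invalidates the whole tree before
 * freeing it, so no peer entry can outlive its namespace.
 */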

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
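
/*
 * When routing realms are enabled, ip_rt_acct provides 256 per-realm
 * byte/packet counters on every CPU; the per-CPU copies are summed when
 * userspace reads /proc/net/rt_acct.  __read_mostly keeps the pointer out
 * of frequently written cache lines, since it is assigned once at boot.
 */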

int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");
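
	/*
	 * ip_idents and ip_tstamps back __ip_select_ident(): IP IDs for
	 * datagrams that need one are drawn from this fixed-size hashed
	 * array rather than from per-destination state, and the array is
	 * seeded with random bytes so that the sequences are not
	 * predictable across boots.
	 */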

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
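
	/*
	 * rt_uncached_list links rtables that are not reachable through
	 * the FIB.  A per-CPU list with its own lock keeps additions and
	 * removals contention-free on the fast path, while device
	 * unregistration can still walk every CPU's list to release
	 * references to the departing netdev.
	 */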
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;
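
	/*
	 * The old routing cache was removed in 3.6, so garbage collection
	 * is effectively disabled here: a gc_thresh of ~0 means dst_alloc()
	 * never invokes ipv4_dst_ops.gc, and max_size no longer bounds a
	 * real cache.
	 */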

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);
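
	/*
	 * RTNL_FLAG_DOIT_UNLOCKED lets inet_rtm_getroute() answer
	 * RTM_GETROUTE requests without holding the rtnl mutex; the
	 * handler performs its FIB lookups under RCU instead.
	 */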

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif