/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
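
/*
 * Usage sketch: the table is indexed with the four TOS bits of the
 * IPv4 header, as the rt_tos2priority() helper in <net/route.h> does,
 * roughly:
 *
 *        prio = ip_tos2prio[IPTOS_TOS(iph->tos) >> 1];
 *
 * e.g. IPTOS_LOWDELAY (0x10) lands on index 8, TC_PRIO_INTERACTIVE.
 */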

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (n && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
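
/*
 * Illustration of the perturbation (assuming HZ=1000): if the bucket
 * chosen by @hash was last used two seconds ago, now - old is ~2000
 * jiffies and the reserved range jumps forward by a random delta in
 * [0, 2000). An observer sampling IP IDs before and after the idle
 * period can then no longer count the packets sent in between.
 */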

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is not safe, but this is okay. */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
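
/*
 * The two-step hash above mixes daddr with a boot-time random seed
 * (jhash_1word()) and then folds the result to FNHE_HASH_SHIFT bits
 * with hash_32(), so which of the FNHE_HASH_SIZE chains a destination
 * lands on is not predictable from outside.
 */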

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, bool lock, unsigned long expires)
{
        struct fib_nh_common *nhc = &nh->nh_common;
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nh->fib_nh_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}
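
/*
 * For orientation: the two callers in this file are __ip_do_redirect(),
 * which records a gateway learned from an ICMP redirect with expiry
 * jiffies + ip_rt_gc_timeout, and __ip_rt_update_pmtu(), which records
 * a learned path MTU with expiry jiffies + ip_rt_mtu_expires.
 */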

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc = FIB_RES_NHC(res);
                                struct fib_nh *nh;

                                nh = container_of(nhc, struct fib_nh, nh_common);
                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

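/*
 * Worked example with the defaults above and HZ=1000:
 * ip_rt_redirect_load is HZ/50 = 20ms, so after the k-th redirect the
 * next one is allowed only 20ms << k later (40ms, 80ms, 160ms, ...).
 * After ip_rt_redirect_number (9) redirects are ignored we go silent;
 * ip_rt_redirect_silence = (HZ/50) << 10, about 20s of quiet, resets
 * the counters.
 */
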
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set dst.rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
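
/*
 * The peer rate limit above is a token bucket: tokens accrue one per
 * jiffy since rate_last, capped at ip_rt_error_burst, and each ICMP
 * error spends ip_rt_error_cost. With the defaults (burst = 5 * HZ,
 * cost = HZ) that allows a burst of five errors, then a sustained
 * rate of one per second.
 */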

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc = FIB_RES_NHC(res);
                struct fib_nh *nh;

                nh = container_of(nhc, struct fib_nh, nh_common);
                update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
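
/*
 * A hedged usage sketch (hypothetical caller): tunnel error handlers
 * typically call this when an ICMP_FRAG_NEEDED arrives for an
 * encapsulated packet, e.g. something like
 *
 *        ipv4_update_pmtu(skb, dev_net(skb->dev), info, 0, IPPROTO_IPIP);
 *
 * where info is the MTU from the ICMP header; the route is looked up
 * and an MTU exception recorded without any socket context.
 */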

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_gw_family && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nh->nh_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nh, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nh, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
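
/*
 * Example of the selection order above: a route whose RTAX_MTU metric
 * is locked (say 1400) always yields 1400; otherwise an unexpired
 * nexthop exception (say a learned PMTU of 1280) wins; otherwise the
 * egress device MTU (typically 1500) is used, capped at IP_MAX_MTU
 * and reduced by any lwtunnel encapsulation headroom.
 */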

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct fib_nh_common *nhc = &nh->nh_common;
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        dst_dev_put(&orig->dst);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}
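
/*
 * The cmpxchg() above is a lockless publish: take a reference on the
 * new route first, try to swap it into the cache slot, and on success
 * release the displaced entry. If another CPU won the race, drop our
 * reference and return false so the caller falls back to
 * rt_add_uncached_list().
 */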
1503
1504 struct uncached_list {
1505         spinlock_t              lock;
1506         struct list_head        head;
1507 };
1508
1509 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1510
1511 void rt_add_uncached_list(struct rtable *rt)
1512 {
1513         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1514
1515         rt->rt_uncached_list = ul;
1516
1517         spin_lock_bh(&ul->lock);
1518         list_add_tail(&rt->rt_uncached, &ul->head);
1519         spin_unlock_bh(&ul->lock);
1520 }
1521
1522 void rt_del_uncached_list(struct rtable *rt)
1523 {
1524         if (!list_empty(&rt->rt_uncached)) {
1525                 struct uncached_list *ul = rt->rt_uncached_list;
1526
1527                 spin_lock_bh(&ul->lock);
1528                 list_del(&rt->rt_uncached);
1529                 spin_unlock_bh(&ul->lock);
1530         }
1531 }
1532
1533 static void ipv4_dst_destroy(struct dst_entry *dst)
1534 {
1535         struct rtable *rt = (struct rtable *)dst;
1536
1537         ip_dst_metrics_put(dst);
1538         rt_del_uncached_list(rt);
1539 }
1540
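/* A device is going away: walk every per-CPU uncached list and re-point any
 * route still using @dev at the loopback device, moving the device reference
 * along with it, so that dangling dsts do not pin the unregistering device.
 */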
1541 void rt_flush_dev(struct net_device *dev)
1542 {
1543         struct net *net = dev_net(dev);
1544         struct rtable *rt;
1545         int cpu;
1546
1547         for_each_possible_cpu(cpu) {
1548                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1549
1550                 spin_lock_bh(&ul->lock);
1551                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1552                         if (rt->dst.dev != dev)
1553                                 continue;
1554                         rt->dst.dev = net->loopback_dev;
1555                         dev_hold(rt->dst.dev);
1556                         dev_put(dev);
1557                 }
1558                 spin_unlock_bh(&ul->lock);
1559         }
1560 }
1561
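/* A cached route is usable only while it still carries the
 * DST_OBSOLETE_FORCE_CHK marker and its generation id matches the
 * namespace's current one (see rt_is_expired()).
 */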
1562 static bool rt_cache_valid(const struct rtable *rt)
1563 {
1564         return  rt &&
1565                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1566                 !rt_is_expired(rt);
1567 }
1568
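/* Copy nexthop state (gateway, metrics, classid, lwtunnel state) from the
 * FIB lookup result into @rt, then try to cache the route either in the
 * nexthop exception @fnhe or in the nexthop itself.  Routes that end up
 * uncached are added to the per-CPU uncached list instead.
 */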
1569 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1570                            const struct fib_result *res,
1571                            struct fib_nh_exception *fnhe,
1572                            struct fib_info *fi, u16 type, u32 itag,
1573                            const bool do_cache)
1574 {
1575         bool cached = false;
1576
1577         if (fi) {
1578                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1579                 struct fib_nh *nh;
1580
1581                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1582                         rt->rt_gw_family = nhc->nhc_gw_family;
1583                         /* only INET and INET6 are supported */
1584                         if (likely(nhc->nhc_gw_family == AF_INET))
1585                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1586                         else
1587                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1588                 }
1589
1590                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1591
1592                 nh = container_of(nhc, struct fib_nh, nh_common);
1593 #ifdef CONFIG_IP_ROUTE_CLASSID
1594                 rt->dst.tclassid = nh->nh_tclassid;
1595 #endif
1596                 rt->dst.lwtstate = lwtstate_get(nh->fib_nh_lws);
1597                 if (unlikely(fnhe))
1598                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1599                 else if (do_cache)
1600                         cached = rt_cache_route(nh, rt);
1601                 if (unlikely(!cached)) {
1602                         /* Routes we intend to cache in nexthop exception or
1603                          * FIB nexthop have the DST_NOCACHE bit clear.
1604                          * However, if we are unsuccessful at storing this
1605                          * route into the cache we really need to set it.
1606                          */
1607                         if (!rt->rt_gw4) {
1608                                 rt->rt_gw_family = AF_INET;
1609                                 rt->rt_gw4 = daddr;
1610                         }
1611                         rt_add_uncached_list(rt);
1612                 }
1613         } else
1614                 rt_add_uncached_list(rt);
1615
1616 #ifdef CONFIG_IP_ROUTE_CLASSID
1617 #ifdef CONFIG_IP_MULTIPLE_TABLES
1618         set_class_tag(rt, res->tclassid);
1619 #endif
1620         set_class_tag(rt, itag);
1621 #endif
1622 }
1623
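/* Allocate and minimally initialize an IPv4 rtable.  The output handler
 * defaults to ip_output() and, for RTCF_LOCAL routes, the input handler to
 * ip_local_deliver(); callers override these as needed (see e.g.
 * ip_route_input_mc() below).
 */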
1624 struct rtable *rt_dst_alloc(struct net_device *dev,
1625                             unsigned int flags, u16 type,
1626                             bool nopolicy, bool noxfrm, bool will_cache)
1627 {
1628         struct rtable *rt;
1629
1630         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1631                        (will_cache ? 0 : DST_HOST) |
1632                        (nopolicy ? DST_NOPOLICY : 0) |
1633                        (noxfrm ? DST_NOXFRM : 0));
1634
1635         if (rt) {
1636                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1637                 rt->rt_flags = flags;
1638                 rt->rt_type = type;
1639                 rt->rt_is_input = 0;
1640                 rt->rt_iif = 0;
1641                 rt->rt_pmtu = 0;
1642                 rt->rt_mtu_locked = 0;
1643                 rt->rt_gw_family = 0;
1644                 rt->rt_gw4 = 0;
1645                 INIT_LIST_HEAD(&rt->rt_uncached);
1646
1647                 rt->dst.output = ip_output;
1648                 if (flags & RTCF_LOCAL)
1649                         rt->dst.input = ip_local_deliver;
1650         }
1651
1652         return rt;
1653 }
1654 EXPORT_SYMBOL(rt_dst_alloc);
1655
1656 /* called in rcu_read_lock() section */
1657 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1658                           u8 tos, struct net_device *dev,
1659                           struct in_device *in_dev, u32 *itag)
1660 {
1661         int err;
1662
1663         /* Primary sanity checks. */
1664         if (!in_dev)
1665                 return -EINVAL;
1666
1667         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1668             skb->protocol != htons(ETH_P_IP))
1669                 return -EINVAL;
1670
1671         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1672                 return -EINVAL;
1673
1674         if (ipv4_is_zeronet(saddr)) {
1675                 if (!ipv4_is_local_multicast(daddr) &&
1676                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1677                         return -EINVAL;
1678         } else {
1679                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1680                                           in_dev, itag);
1681                 if (err < 0)
1682                         return err;
1683         }
1684         return 0;
1685 }
1686
1687 /* called in rcu_read_lock() section */
1688 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1689                              u8 tos, struct net_device *dev, int our)
1690 {
1691         struct in_device *in_dev = __in_dev_get_rcu(dev);
1692         unsigned int flags = RTCF_MULTICAST;
1693         struct rtable *rth;
1694         u32 itag = 0;
1695         int err;
1696
1697         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1698         if (err)
1699                 return err;
1700
1701         if (our)
1702                 flags |= RTCF_LOCAL;
1703
1704         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1705                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1706         if (!rth)
1707                 return -ENOBUFS;
1708
1709 #ifdef CONFIG_IP_ROUTE_CLASSID
1710         rth->dst.tclassid = itag;
1711 #endif
1712         rth->dst.output = ip_rt_bug;
1713         rth->rt_is_input = 1;
1714
1715 #ifdef CONFIG_IP_MROUTE
1716         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1717                 rth->dst.input = ip_mr_input;
1718 #endif
1719         RT_CACHE_STAT_INC(in_slow_mc);
1720
1721         skb_dst_set(skb, &rth->dst);
1722         return 0;
1723 }
1724
1725
1726 static void ip_handle_martian_source(struct net_device *dev,
1727                                      struct in_device *in_dev,
1728                                      struct sk_buff *skb,
1729                                      __be32 daddr,
1730                                      __be32 saddr)
1731 {
1732         RT_CACHE_STAT_INC(in_martian_src);
1733 #ifdef CONFIG_IP_ROUTE_VERBOSE
1734         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1735                 /*
1736                  *      RFC1812 recommendation: if the source is martian,
1737                  *      the only hint is the MAC header.
1738                  */
1739                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1740                         &daddr, &saddr, dev->name);
1741                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1742                         print_hex_dump(KERN_WARNING, "ll header: ",
1743                                        DUMP_PREFIX_OFFSET, 16, 1,
1744                                        skb_mac_header(skb),
1745                                        dev->hard_header_len, false);
1746                 }
1747         }
1748 #endif
1749 }
1750
1751 /* called in rcu_read_lock() section */
1752 static int __mkroute_input(struct sk_buff *skb,
1753                            const struct fib_result *res,
1754                            struct in_device *in_dev,
1755                            __be32 daddr, __be32 saddr, u32 tos)
1756 {
1757         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1758         struct net_device *dev = nhc->nhc_dev;
1759         struct fib_nh_exception *fnhe;
1760         struct rtable *rth;
1761         struct fib_nh *nh;
1762         int err;
1763         struct in_device *out_dev;
1764         bool do_cache;
1765         u32 itag = 0;
1766
1767         /* get a working reference to the output device */
1768         out_dev = __in_dev_get_rcu(dev);
1769         if (!out_dev) {
1770                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1771                 return -EINVAL;
1772         }
1773
1774         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1775                                   in_dev->dev, in_dev, &itag);
1776         if (err < 0) {
1777                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1778                                          saddr);
1779
1780                 goto cleanup;
1781         }
1782
1783         do_cache = res->fi && !itag;
1784         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1785             skb->protocol == htons(ETH_P_IP)) {
1786                 __be32 gw;
1787
1788                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1789                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1790                     inet_addr_onlink(out_dev, saddr, gw))
1791                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1792         }
1793
1794         if (skb->protocol != htons(ETH_P_IP)) {
1795                 /* Not IP (i.e. ARP). Do not create a route if it is
1796                  * invalid for proxy ARP. DNAT routes are always valid.
1797                  *
1798                  * The proxy ARP feature has been extended to allow ARP
1799                  * replies back to the same interface, to support
1800                  * Private VLAN switch technologies. See arp.c.
1801                  */
1802                 if (out_dev == in_dev &&
1803                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1804                         err = -EINVAL;
1805                         goto cleanup;
1806                 }
1807         }
1808
1809         nh = container_of(nhc, struct fib_nh, nh_common);
1810         fnhe = find_exception(nh, daddr);
1811         if (do_cache) {
1812                 if (fnhe)
1813                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1814                 else
1815                         rth = rcu_dereference(nhc->nhc_rth_input);
1816                 if (rt_cache_valid(rth)) {
1817                         skb_dst_set_noref(skb, &rth->dst);
1818                         goto out;
1819                 }
1820         }
1821
1822         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1823                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1824                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1825         if (!rth) {
1826                 err = -ENOBUFS;
1827                 goto cleanup;
1828         }
1829
1830         rth->rt_is_input = 1;
1831         RT_CACHE_STAT_INC(in_slow_tot);
1832
1833         rth->dst.input = ip_forward;
1834
1835         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1836                        do_cache);
1837         lwtunnel_set_redirect(&rth->dst);
1838         skb_dst_set(skb, &rth->dst);
1839 out:
1840         err = 0;
1841  cleanup:
1842         return err;
1843 }
1844
1845 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1846 /* To make ICMP packets follow the right flow, the multipath hash is
1847  * calculated from the inner IP addresses.
1848  */
1849 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1850                                  struct flow_keys *hash_keys)
1851 {
1852         const struct iphdr *outer_iph = ip_hdr(skb);
1853         const struct iphdr *key_iph = outer_iph;
1854         const struct iphdr *inner_iph;
1855         const struct icmphdr *icmph;
1856         struct iphdr _inner_iph;
1857         struct icmphdr _icmph;
1858
1859         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1860                 goto out;
1861
1862         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1863                 goto out;
1864
1865         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1866                                    &_icmph);
1867         if (!icmph)
1868                 goto out;
1869
1870         if (icmph->type != ICMP_DEST_UNREACH &&
1871             icmph->type != ICMP_REDIRECT &&
1872             icmph->type != ICMP_TIME_EXCEEDED &&
1873             icmph->type != ICMP_PARAMETERPROB)
1874                 goto out;
1875
1876         inner_iph = skb_header_pointer(skb,
1877                                        outer_iph->ihl * 4 + sizeof(_icmph),
1878                                        sizeof(_inner_iph), &_inner_iph);
1879         if (!inner_iph)
1880                 goto out;
1881
1882         key_iph = inner_iph;
1883 out:
1884         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1885         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1886 }
1887
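/* Compute the multipath hash according to net.ipv4.fib_multipath_hash_policy:
 * policy 0 hashes on the L3 source/destination addresses (using the inner
 * header for ICMP errors, see ip_multipath_l3_keys() above), while policy 1
 * hashes on the L4 five-tuple.  Illustrative sysctl usage (not part of this
 * file):
 *
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */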
1888 /* if skb is set it will be used and fl4 can be NULL */
1889 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1890                        const struct sk_buff *skb, struct flow_keys *flkeys)
1891 {
1892         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1893         struct flow_keys hash_keys;
1894         u32 mhash;
1895
1896         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1897         case 0:
1898                 memset(&hash_keys, 0, sizeof(hash_keys));
1899                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1900                 if (skb) {
1901                         ip_multipath_l3_keys(skb, &hash_keys);
1902                 } else {
1903                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1904                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1905                 }
1906                 break;
1907         case 1:
1908                 /* skb is currently provided only when forwarding */
1909                 if (skb) {
1910                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1911                         struct flow_keys keys;
1912
1913                         /* short-circuit if we already have L4 hash present */
1914                         if (skb->l4_hash)
1915                                 return skb_get_hash_raw(skb) >> 1;
1916
1917                         memset(&hash_keys, 0, sizeof(hash_keys));
1918
1919                         if (!flkeys) {
1920                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1921                                 flkeys = &keys;
1922                         }
1923
1924                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1925                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1926                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1927                         hash_keys.ports.src = flkeys->ports.src;
1928                         hash_keys.ports.dst = flkeys->ports.dst;
1929                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1930                 } else {
1931                         memset(&hash_keys, 0, sizeof(hash_keys));
1932                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1933                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1934                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1935                         hash_keys.ports.src = fl4->fl4_sport;
1936                         hash_keys.ports.dst = fl4->fl4_dport;
1937                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1938                 }
1939                 break;
1940         }
1941         mhash = flow_hash_from_keys(&hash_keys);
1942
1943         if (multipath_hash)
1944                 mhash = jhash_2words(mhash, multipath_hash, 0);
1945
1946         return mhash >> 1;
1947 }
1948 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1949
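/* Input-path route construction: if the matched fib_info has more than one
 * nexthop, pick one via the multipath hash before building the cache entry.
 */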
1950 static int ip_mkroute_input(struct sk_buff *skb,
1951                             struct fib_result *res,
1952                             struct in_device *in_dev,
1953                             __be32 daddr, __be32 saddr, u32 tos,
1954                             struct flow_keys *hkeys)
1955 {
1956 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1957         if (res->fi && res->fi->fib_nhs > 1) {
1958                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1959
1960                 fib_select_multipath(res, h);
1961         }
1962 #endif
1963
1964         /* create a routing cache entry */
1965         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1966 }
1967
1968 /*
1969  *      NOTE. We drop all packets that have a local source
1970  *      address, because every properly looped-back packet
1971  *      must already have the correct destination attached by the output routine.
1972  *
1973  *      This approach solves two big problems:
1974  *      1. Non-simplex devices are handled properly.
1975  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1976  *      Called with rcu_read_lock().
1977  */
1978
1979 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1980                                u8 tos, struct net_device *dev,
1981                                struct fib_result *res)
1982 {
1983         struct in_device *in_dev = __in_dev_get_rcu(dev);
1984         struct flow_keys *flkeys = NULL, _flkeys;
1985         struct net    *net = dev_net(dev);
1986         struct ip_tunnel_info *tun_info;
1987         int             err = -EINVAL;
1988         unsigned int    flags = 0;
1989         u32             itag = 0;
1990         struct rtable   *rth;
1991         struct flowi4   fl4;
1992         bool do_cache;
1993
1994         /* IP on this device is disabled. */
1995
1996         if (!in_dev)
1997                 goto out;
1998
1999         /* Check for the weirdest martians, which cannot be detected
2000            by fib_lookup.
2001          */
2002
2003         tun_info = skb_tunnel_info(skb);
2004         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2005                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2006         else
2007                 fl4.flowi4_tun_key.tun_id = 0;
2008         skb_dst_drop(skb);
2009
2010         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2011                 goto martian_source;
2012
2013         res->fi = NULL;
2014         res->table = NULL;
2015         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2016                 goto brd_input;
2017
2018         /* Accept zero addresses only for limited broadcast;
2019          * I do not even know whether to fix this or not. Waiting for complaints :-)
2020          */
2021         if (ipv4_is_zeronet(saddr))
2022                 goto martian_source;
2023
2024         if (ipv4_is_zeronet(daddr))
2025                 goto martian_destination;
2026
2027         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2028          * and calls it at most once when daddr and/or saddr is a loopback address
2029          */
2030         if (ipv4_is_loopback(daddr)) {
2031                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2032                         goto martian_destination;
2033         } else if (ipv4_is_loopback(saddr)) {
2034                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2035                         goto martian_source;
2036         }
2037
2038         /*
2039          *      Now we are ready to route packet.
2040          */
2041         fl4.flowi4_oif = 0;
2042         fl4.flowi4_iif = dev->ifindex;
2043         fl4.flowi4_mark = skb->mark;
2044         fl4.flowi4_tos = tos;
2045         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2046         fl4.flowi4_flags = 0;
2047         fl4.daddr = daddr;
2048         fl4.saddr = saddr;
2049         fl4.flowi4_uid = sock_net_uid(net, NULL);
2050
2051         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2052                 flkeys = &_flkeys;
2053         } else {
2054                 fl4.flowi4_proto = 0;
2055                 fl4.fl4_sport = 0;
2056                 fl4.fl4_dport = 0;
2057         }
2058
2059         err = fib_lookup(net, &fl4, res, 0);
2060         if (err != 0) {
2061                 if (!IN_DEV_FORWARD(in_dev))
2062                         err = -EHOSTUNREACH;
2063                 goto no_route;
2064         }
2065
2066         if (res->type == RTN_BROADCAST) {
2067                 if (IN_DEV_BFORWARD(in_dev))
2068                         goto make_route;
2069                 goto brd_input;
2070         }
2071
2072         if (res->type == RTN_LOCAL) {
2073                 err = fib_validate_source(skb, saddr, daddr, tos,
2074                                           0, dev, in_dev, &itag);
2075                 if (err < 0)
2076                         goto martian_source;
2077                 goto local_input;
2078         }
2079
2080         if (!IN_DEV_FORWARD(in_dev)) {
2081                 err = -EHOSTUNREACH;
2082                 goto no_route;
2083         }
2084         if (res->type != RTN_UNICAST)
2085                 goto martian_destination;
2086
2087 make_route:
2088         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2089 out:    return err;
2090
2091 brd_input:
2092         if (skb->protocol != htons(ETH_P_IP))
2093                 goto e_inval;
2094
2095         if (!ipv4_is_zeronet(saddr)) {
2096                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2097                                           in_dev, &itag);
2098                 if (err < 0)
2099                         goto martian_source;
2100         }
2101         flags |= RTCF_BROADCAST;
2102         res->type = RTN_BROADCAST;
2103         RT_CACHE_STAT_INC(in_brd);
2104
2105 local_input:
2106         do_cache = false;
2107         if (res->fi) {
2108                 if (!itag) {
2109                         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2110
2111                         rth = rcu_dereference(nhc->nhc_rth_input);
2112                         if (rt_cache_valid(rth)) {
2113                                 skb_dst_set_noref(skb, &rth->dst);
2114                                 err = 0;
2115                                 goto out;
2116                         }
2117                         do_cache = true;
2118                 }
2119         }
2120
2121         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2122                            flags | RTCF_LOCAL, res->type,
2123                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2124         if (!rth)
2125                 goto e_nobufs;
2126
2127         rth->dst.output = ip_rt_bug;
2128 #ifdef CONFIG_IP_ROUTE_CLASSID
2129         rth->dst.tclassid = itag;
2130 #endif
2131         rth->rt_is_input = 1;
2132
2133         RT_CACHE_STAT_INC(in_slow_tot);
2134         if (res->type == RTN_UNREACHABLE) {
2135                 rth->dst.input = ip_error;
2136                 rth->dst.error = -err;
2137                 rth->rt_flags   &= ~RTCF_LOCAL;
2138         }
2139
2140         if (do_cache) {
2141                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2142                 struct fib_nh *nh;
2143
2144                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2145                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2146                         WARN_ON(rth->dst.input == lwtunnel_input);
2147                         rth->dst.lwtstate->orig_input = rth->dst.input;
2148                         rth->dst.input = lwtunnel_input;
2149                 }
2150
2151                 nh = container_of(nhc, struct fib_nh, nh_common);
2152                 if (unlikely(!rt_cache_route(nh, rth)))
2153                         rt_add_uncached_list(rth);
2154         }
2155         skb_dst_set(skb, &rth->dst);
2156         err = 0;
2157         goto out;
2158
2159 no_route:
2160         RT_CACHE_STAT_INC(in_no_route);
2161         res->type = RTN_UNREACHABLE;
2162         res->fi = NULL;
2163         res->table = NULL;
2164         goto local_input;
2165
2166         /*
2167          *      Do not cache martian addresses: they should be logged (RFC1812)
2168          */
2169 martian_destination:
2170         RT_CACHE_STAT_INC(in_martian_dst);
2171 #ifdef CONFIG_IP_ROUTE_VERBOSE
2172         if (IN_DEV_LOG_MARTIANS(in_dev))
2173                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2174                                      &daddr, &saddr, dev->name);
2175 #endif
2176
2177 e_inval:
2178         err = -EINVAL;
2179         goto out;
2180
2181 e_nobufs:
2182         err = -ENOBUFS;
2183         goto out;
2184
2185 martian_source:
2186         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2187         goto out;
2188 }
2189
2190 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2191                          u8 tos, struct net_device *dev)
2192 {
2193         struct fib_result res;
2194         int err;
2195
2196         tos &= IPTOS_RT_MASK;
2197         rcu_read_lock();
2198         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2199         rcu_read_unlock();
2200
2201         return err;
2202 }
2203 EXPORT_SYMBOL(ip_route_input_noref);
2204
2205 /* called with rcu_read_lock held */
2206 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2207                        u8 tos, struct net_device *dev, struct fib_result *res)
2208 {
2209         /* Multicast recognition logic was moved from the route cache to here.
2210            The problem was that too many Ethernet cards have broken/missing
2211            hardware multicast filters :-( As a result, a host on a multicast
2212            network acquires a lot of useless route cache entries, e.g. from
2213            SDR messages from all over the world. Now we try to get rid of them.
2214            Really, provided the software IP multicast filter is organized
2215            reasonably (at least, hashed), it does not result in a slowdown
2216            compared with route cache reject entries.
2217            Note that multicast routers are not affected, because a
2218            route cache entry is eventually created.
2219          */
2220         if (ipv4_is_multicast(daddr)) {
2221                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2222                 int our = 0;
2223                 int err = -EINVAL;
2224
2225                 if (!in_dev)
2226                         return err;
2227                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2228                                       ip_hdr(skb)->protocol);
2229
2230                 /* check l3 master if no match yet */
2231                 if (!our && netif_is_l3_slave(dev)) {
2232                         struct in_device *l3_in_dev;
2233
2234                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2235                         if (l3_in_dev)
2236                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2237                                                       ip_hdr(skb)->protocol);
2238                 }
2239
2240                 if (our
2241 #ifdef CONFIG_IP_MROUTE
2242                         ||
2243                     (!ipv4_is_local_multicast(daddr) &&
2244                      IN_DEV_MFORWARD(in_dev))
2245 #endif
2246                    ) {
2247                         err = ip_route_input_mc(skb, daddr, saddr,
2248                                                 tos, dev, our);
2249                 }
2250                 return err;
2251         }
2252
2253         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2254 }
2255
2256 /* called with rcu_read_lock() */
2257 static struct rtable *__mkroute_output(const struct fib_result *res,
2258                                        const struct flowi4 *fl4, int orig_oif,
2259                                        struct net_device *dev_out,
2260                                        unsigned int flags)
2261 {
2262         struct fib_info *fi = res->fi;
2263         struct fib_nh_exception *fnhe;
2264         struct in_device *in_dev;
2265         u16 type = res->type;
2266         struct rtable *rth;
2267         bool do_cache;
2268
2269         in_dev = __in_dev_get_rcu(dev_out);
2270         if (!in_dev)
2271                 return ERR_PTR(-EINVAL);
2272
2273         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2274                 if (ipv4_is_loopback(fl4->saddr) &&
2275                     !(dev_out->flags & IFF_LOOPBACK) &&
2276                     !netif_is_l3_master(dev_out))
2277                         return ERR_PTR(-EINVAL);
2278
2279         if (ipv4_is_lbcast(fl4->daddr))
2280                 type = RTN_BROADCAST;
2281         else if (ipv4_is_multicast(fl4->daddr))
2282                 type = RTN_MULTICAST;
2283         else if (ipv4_is_zeronet(fl4->daddr))
2284                 return ERR_PTR(-EINVAL);
2285
2286         if (dev_out->flags & IFF_LOOPBACK)
2287                 flags |= RTCF_LOCAL;
2288
2289         do_cache = true;
2290         if (type == RTN_BROADCAST) {
2291                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2292                 fi = NULL;
2293         } else if (type == RTN_MULTICAST) {
2294                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2295                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2296                                      fl4->flowi4_proto))
2297                         flags &= ~RTCF_LOCAL;
2298                 else
2299                         do_cache = false;
2300                 /* If a multicast route does not exist, use the
2301                  * default one, but do not use a gateway in this case.
2302                  * Yes, it is a hack.
2303                  */
2304                 if (fi && res->prefixlen < 4)
2305                         fi = NULL;
2306         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2307                    (orig_oif != dev_out->ifindex)) {
2308                 /* For local routes that require a particular output interface
2309                  * we do not want to cache the result.  Caching the result
2310                  * causes incorrect behaviour when there are multiple source
2311                  * addresses on the interface, the end result being that if the
2312                  * intended recipient is waiting on that interface for the
2313                  * packet, it won't receive it because it will be delivered on
2314                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2315                  * be set to the loopback interface as well.
2316                  */
2317                 do_cache = false;
2318         }
2319
2320         fnhe = NULL;
2321         do_cache &= fi != NULL;
2322         if (fi) {
2323                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2324                 struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
2325                 struct rtable __rcu **prth;
2326
2327                 fnhe = find_exception(nh, fl4->daddr);
2328                 if (!do_cache)
2329                         goto add;
2330                 if (fnhe) {
2331                         prth = &fnhe->fnhe_rth_output;
2332                 } else {
2333                         if (unlikely(fl4->flowi4_flags &
2334                                      FLOWI_FLAG_KNOWN_NH &&
2335                                      !(nhc->nhc_gw_family &&
2336                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2337                                 do_cache = false;
2338                                 goto add;
2339                         }
2340                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2341                 }
2342                 rth = rcu_dereference(*prth);
2343                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2344                         return rth;
2345         }
2346
2347 add:
2348         rth = rt_dst_alloc(dev_out, flags, type,
2349                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2350                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2351                            do_cache);
2352         if (!rth)
2353                 return ERR_PTR(-ENOBUFS);
2354
2355         rth->rt_iif = orig_oif;
2356
2357         RT_CACHE_STAT_INC(out_slow_tot);
2358
2359         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2360                 if (flags & RTCF_LOCAL &&
2361                     !(dev_out->flags & IFF_LOOPBACK)) {
2362                         rth->dst.output = ip_mc_output;
2363                         RT_CACHE_STAT_INC(out_slow_mc);
2364                 }
2365 #ifdef CONFIG_IP_MROUTE
2366                 if (type == RTN_MULTICAST) {
2367                         if (IN_DEV_MFORWARD(in_dev) &&
2368                             !ipv4_is_local_multicast(fl4->daddr)) {
2369                                 rth->dst.input = ip_mr_input;
2370                                 rth->dst.output = ip_mc_output;
2371                         }
2372                 }
2373 #endif
2374         }
2375
2376         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2377         lwtunnel_set_redirect(&rth->dst);
2378
2379         return rth;
2380 }
2381
2382 /*
2383  * Major route resolver routine.
2384  */
2385
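/* A minimal output-lookup sketch, as a typical caller might write it (the
 * identifiers daddr/saddr/tos/oif here are illustrative, not from this file):
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,
 *		.saddr = saddr,
 *		.flowi4_tos = RT_TOS(tos),
 *		.flowi4_oif = oif,
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */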
2386 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2387                                         const struct sk_buff *skb)
2388 {
2389         __u8 tos = RT_FL_TOS(fl4);
2390         struct fib_result res = {
2391                 .type           = RTN_UNSPEC,
2392                 .fi             = NULL,
2393                 .table          = NULL,
2394                 .tclassid       = 0,
2395         };
2396         struct rtable *rth;
2397
2398         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2399         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2400         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2401                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2402
2403         rcu_read_lock();
2404         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2405         rcu_read_unlock();
2406
2407         return rth;
2408 }
2409 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2410
2411 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2412                                             struct fib_result *res,
2413                                             const struct sk_buff *skb)
2414 {
2415         struct net_device *dev_out = NULL;
2416         int orig_oif = fl4->flowi4_oif;
2417         unsigned int flags = 0;
2418         struct rtable *rth;
2419         int err = -ENETUNREACH;
2420
2421         if (fl4->saddr) {
2422                 rth = ERR_PTR(-EINVAL);
2423                 if (ipv4_is_multicast(fl4->saddr) ||
2424                     ipv4_is_lbcast(fl4->saddr) ||
2425                     ipv4_is_zeronet(fl4->saddr))
2426                         goto out;
2427
2428                 /* I removed check for oif == dev_out->oif here.
2429                    It was wrong for two reasons:
2430                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2431                       is assigned to multiple interfaces.
2432                    2. Moreover, we are allowed to send packets with saddr
2433                       of another iface. --ANK
2434                  */
2435
2436                 if (fl4->flowi4_oif == 0 &&
2437                     (ipv4_is_multicast(fl4->daddr) ||
2438                      ipv4_is_lbcast(fl4->daddr))) {
2439                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2440                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2441                         if (!dev_out)
2442                                 goto out;
2443
2444                         /* Special hack: the user can direct multicasts
2445                            and limited broadcast via the necessary interface
2446                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2447                            This hack is not just for fun; it allows
2448                            vic, vat and friends to work.
2449                            They bind a socket to loopback, set the ttl to zero
2450                            and expect that it will work.
2451                            From the viewpoint of the routing cache they are broken,
2452                            because we are not allowed to build a multicast path
2453                            with a loopback source addr (look, the routing cache
2454                            cannot know that the ttl is zero, so the packet
2455                            will not leave this host and the route is valid).
2456                            Luckily, this hack is a good workaround.
2457                          */
2458
2459                         fl4->flowi4_oif = dev_out->ifindex;
2460                         goto make_route;
2461                 }
2462
2463                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2464                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2465                         if (!__ip_dev_find(net, fl4->saddr, false))
2466                                 goto out;
2467                 }
2468         }
2469
2470
2471         if (fl4->flowi4_oif) {
2472                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2473                 rth = ERR_PTR(-ENODEV);
2474                 if (!dev_out)
2475                         goto out;
2476
2477                 /* RACE: Check return value of inet_select_addr instead. */
2478                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2479                         rth = ERR_PTR(-ENETUNREACH);
2480                         goto out;
2481                 }
2482                 if (ipv4_is_local_multicast(fl4->daddr) ||
2483                     ipv4_is_lbcast(fl4->daddr) ||
2484                     fl4->flowi4_proto == IPPROTO_IGMP) {
2485                         if (!fl4->saddr)
2486                                 fl4->saddr = inet_select_addr(dev_out, 0,
2487                                                               RT_SCOPE_LINK);
2488                         goto make_route;
2489                 }
2490                 if (!fl4->saddr) {
2491                         if (ipv4_is_multicast(fl4->daddr))
2492                                 fl4->saddr = inet_select_addr(dev_out, 0,
2493                                                               fl4->flowi4_scope);
2494                         else if (!fl4->daddr)
2495                                 fl4->saddr = inet_select_addr(dev_out, 0,
2496                                                               RT_SCOPE_HOST);
2497                 }
2498         }
2499
2500         if (!fl4->daddr) {
2501                 fl4->daddr = fl4->saddr;
2502                 if (!fl4->daddr)
2503                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2504                 dev_out = net->loopback_dev;
2505                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2506                 res->type = RTN_LOCAL;
2507                 flags |= RTCF_LOCAL;
2508                 goto make_route;
2509         }
2510
2511         err = fib_lookup(net, fl4, res, 0);
2512         if (err) {
2513                 res->fi = NULL;
2514                 res->table = NULL;
2515                 if (fl4->flowi4_oif &&
2516                     (ipv4_is_multicast(fl4->daddr) ||
2517                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2518                         /* Apparently, the routing tables are wrong. Assume
2519                            that the destination is on-link.
2520
2521                            WHY? DW.
2522                            Because we are allowed to send to an iface
2523                            even if it has NO routes and NO assigned
2524                            addresses. When oif is specified, the routing
2525                            tables are looked up with only one purpose:
2526                            to catch whether the destination is gatewayed
2527                            rather than direct. Moreover, if MSG_DONTROUTE
2528                            is set, we send the packet, ignoring both the
2529                            routing tables and the ifaddr state. --ANK
2530
2531
2532                            We could do this even if oif is unknown,
2533                            likely IPv6, but we do not.
2534                          */
2535
2536                         if (fl4->saddr == 0)
2537                                 fl4->saddr = inet_select_addr(dev_out, 0,
2538                                                               RT_SCOPE_LINK);
2539                         res->type = RTN_UNICAST;
2540                         goto make_route;
2541                 }
2542                 rth = ERR_PTR(err);
2543                 goto out;
2544         }
2545
2546         if (res->type == RTN_LOCAL) {
2547                 if (!fl4->saddr) {
2548                         if (res->fi->fib_prefsrc)
2549                                 fl4->saddr = res->fi->fib_prefsrc;
2550                         else
2551                                 fl4->saddr = fl4->daddr;
2552                 }
2553
2554                 /* L3 master device is the loopback for that domain */
2555                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2556                         net->loopback_dev;
2557
2558                 /* make sure orig_oif points to fib result device even
2559                  * though packet rx/tx happens over loopback or l3mdev
2560                  */
2561                 orig_oif = FIB_RES_OIF(*res);
2562
2563                 fl4->flowi4_oif = dev_out->ifindex;
2564                 flags |= RTCF_LOCAL;
2565                 goto make_route;
2566         }
2567
2568         fib_select_path(net, res, fl4, skb);
2569
2570         dev_out = FIB_RES_DEV(*res);
2571         fl4->flowi4_oif = dev_out->ifindex;
2572
2573
2574 make_route:
2575         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2576
2577 out:
2578         return rth;
2579 }
2580
2581 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2582 {
2583         return NULL;
2584 }
2585
2586 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2587 {
2588         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2589
2590         return mtu ? : dst->dev->mtu;
2591 }
2592
2593 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2594                                           struct sk_buff *skb, u32 mtu)
2595 {
2596 }
2597
2598 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2599                                        struct sk_buff *skb)
2600 {
2601 }
2602
2603 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2604                                           unsigned long old)
2605 {
2606         return NULL;
2607 }
2608
2609 static struct dst_ops ipv4_dst_blackhole_ops = {
2610         .family                 =       AF_INET,
2611         .check                  =       ipv4_blackhole_dst_check,
2612         .mtu                    =       ipv4_blackhole_mtu,
2613         .default_advmss         =       ipv4_default_advmss,
2614         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2615         .redirect               =       ipv4_rt_blackhole_redirect,
2616         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2617         .neigh_lookup           =       ipv4_neigh_lookup,
2618 };
2619
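/* Clone @dst_orig into a "blackhole" route whose input/output handlers
 * simply discard packets and whose dst_ops are all no-ops.  This is used
 * (e.g. by the xfrm code) when a route must be returned but traffic on it
 * has to be dropped.
 */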
2620 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2621 {
2622         struct rtable *ort = (struct rtable *) dst_orig;
2623         struct rtable *rt;
2624
2625         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2626         if (rt) {
2627                 struct dst_entry *new = &rt->dst;
2628
2629                 new->__use = 1;
2630                 new->input = dst_discard;
2631                 new->output = dst_discard_out;
2632
2633                 new->dev = net->loopback_dev;
2634                 if (new->dev)
2635                         dev_hold(new->dev);
2636
2637                 rt->rt_is_input = ort->rt_is_input;
2638                 rt->rt_iif = ort->rt_iif;
2639                 rt->rt_pmtu = ort->rt_pmtu;
2640                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2641
2642                 rt->rt_genid = rt_genid_ipv4(net);
2643                 rt->rt_flags = ort->rt_flags;
2644                 rt->rt_type = ort->rt_type;
2645                 rt->rt_gw_family = ort->rt_gw_family;
2646                 if (rt->rt_gw_family == AF_INET)
2647                         rt->rt_gw4 = ort->rt_gw4;
2648                 else if (rt->rt_gw_family == AF_INET6)
2649                         rt->rt_gw6 = ort->rt_gw6;
2650
2651                 INIT_LIST_HEAD(&rt->rt_uncached);
2652         }
2653
2654         dst_release(dst_orig);
2655
2656         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2657 }
2658
2659 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2660                                     const struct sock *sk)
2661 {
2662         struct rtable *rt = __ip_route_output_key(net, flp4);
2663
2664         if (IS_ERR(rt))
2665                 return rt;
2666
2667         if (flp4->flowi4_proto)
2668                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2669                                                         flowi4_to_flowi(flp4),
2670                                                         sk, 0);
2671
2672         return rt;
2673 }
2674 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2675
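/* Fill an RTM_NEWROUTE netlink message describing @rt/@fl4 into @skb;
 * returns 0 on success or -EMSGSIZE if the message does not fit.
 */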
2676 /* called with rcu_read_lock held */
2677 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2678                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2679                         struct sk_buff *skb, u32 portid, u32 seq)
2680 {
2681         struct rtmsg *r;
2682         struct nlmsghdr *nlh;
2683         unsigned long expires = 0;
2684         u32 error;
2685         u32 metrics[RTAX_MAX];
2686
2687         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2688         if (!nlh)
2689                 return -EMSGSIZE;
2690
2691         r = nlmsg_data(nlh);
2692         r->rtm_family    = AF_INET;
2693         r->rtm_dst_len  = 32;
2694         r->rtm_src_len  = 0;
2695         r->rtm_tos      = fl4->flowi4_tos;
2696         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2697         if (nla_put_u32(skb, RTA_TABLE, table_id))
2698                 goto nla_put_failure;
2699         r->rtm_type     = rt->rt_type;
2700         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2701         r->rtm_protocol = RTPROT_UNSPEC;
2702         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2703         if (rt->rt_flags & RTCF_NOTIFY)
2704                 r->rtm_flags |= RTM_F_NOTIFY;
2705         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2706                 r->rtm_flags |= RTCF_DOREDIRECT;
2707
2708         if (nla_put_in_addr(skb, RTA_DST, dst))
2709                 goto nla_put_failure;
2710         if (src) {
2711                 r->rtm_src_len = 32;
2712                 if (nla_put_in_addr(skb, RTA_SRC, src))
2713                         goto nla_put_failure;
2714         }
2715         if (rt->dst.dev &&
2716             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2717                 goto nla_put_failure;
2718 #ifdef CONFIG_IP_ROUTE_CLASSID
2719         if (rt->dst.tclassid &&
2720             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2721                 goto nla_put_failure;
2722 #endif
2723         if (!rt_is_input_route(rt) &&
2724             fl4->saddr != src) {
2725                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2726                         goto nla_put_failure;
2727         }
2728         if (rt->rt_gw_family == AF_INET &&
2729             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2730                 goto nla_put_failure;
2731         } else if (rt->rt_gw_family == AF_INET6) {
2732                 int alen = sizeof(struct in6_addr);
2733                 struct nlattr *nla;
2734                 struct rtvia *via;
2735
2736                 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2737                 if (!nla)
2738                         goto nla_put_failure;
2739
2740                 via = nla_data(nla);
2741                 via->rtvia_family = AF_INET6;
2742                 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2743         }
2744
2745         expires = rt->dst.expires;
2746         if (expires) {
2747                 unsigned long now = jiffies;
2748
2749                 if (time_before(now, expires))
2750                         expires -= now;
2751                 else
2752                         expires = 0;
2753         }
2754
2755         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2756         if (rt->rt_pmtu && expires)
2757                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2758         if (rt->rt_mtu_locked && expires)
2759                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2760         if (rtnetlink_put_metrics(skb, metrics) < 0)
2761                 goto nla_put_failure;
2762
2763         if (fl4->flowi4_mark &&
2764             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2765                 goto nla_put_failure;
2766
2767         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2768             nla_put_u32(skb, RTA_UID,
2769                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2770                 goto nla_put_failure;
2771
2772         error = rt->dst.error;
2773
2774         if (rt_is_input_route(rt)) {
2775 #ifdef CONFIG_IP_MROUTE
2776                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2777                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2778                         int err = ipmr_get_route(net, skb,
2779                                                  fl4->saddr, fl4->daddr,
2780                                                  r, portid);
2781
2782                         if (err <= 0) {
2783                                 if (err == 0)
2784                                         return 0;
2785                                 goto nla_put_failure;
2786                         }
2787                 } else
2788 #endif
2789                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2790                                 goto nla_put_failure;
2791         }
2792
2793         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2794                 goto nla_put_failure;
2795
2796         nlmsg_end(skb, nlh);
2797         return 0;
2798
2799 nla_put_failure:
2800         nlmsg_cancel(skb, nlh);
2801         return -EMSGSIZE;
2802 }
2803
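/* Build a dummy skb carrying a synthetic IPv4 header (plus a UDP, TCP or
 * ICMP header when requested) so that a route-get request with an input
 * interface can be run through the real input path, including multipath
 * hashing over the transport ports.
 */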
2804 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2805                                                    u8 ip_proto, __be16 sport,
2806                                                    __be16 dport)
2807 {
2808         struct sk_buff *skb;
2809         struct iphdr *iph;
2810
2811         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2812         if (!skb)
2813                 return NULL;
2814
2815         /* Reserve room for dummy headers; this skb can pass
2816          * through a good chunk of the routing engine.
2817          */
2818         skb_reset_mac_header(skb);
2819         skb_reset_network_header(skb);
2820         skb->protocol = htons(ETH_P_IP);
2821         iph = skb_put(skb, sizeof(struct iphdr));
2822         iph->protocol = ip_proto;
2823         iph->saddr = src;
2824         iph->daddr = dst;
2825         iph->version = 0x4;
2826         iph->frag_off = 0;
2827         iph->ihl = 0x5;
2828         skb_set_transport_header(skb, skb->len);
2829
2830         switch (iph->protocol) {
2831         case IPPROTO_UDP: {
2832                 struct udphdr *udph;
2833
2834                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2835                 udph->source = sport;
2836                 udph->dest = dport;
2837                 udph->len = sizeof(struct udphdr);
2838                 udph->check = 0;
2839                 break;
2840         }
2841         case IPPROTO_TCP: {
2842                 struct tcphdr *tcph;
2843
2844                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2845                 tcph->source    = sport;
2846                 tcph->dest      = dport;
2847                 tcph->doff      = sizeof(struct tcphdr) / 4;
2848                 tcph->rst = 1;
2849                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2850                                             src, dst, 0);
2851                 break;
2852         }
2853         case IPPROTO_ICMP: {
2854                 struct icmphdr *icmph;
2855
2856                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2857                 icmph->type = ICMP_ECHO;
2858                 icmph->code = 0;
2859         }
2860         }
2861
2862         return skb;
2863 }
2864
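/* Strict validation of an RTM_GETROUTE request: when the requester opted in
 * to strict checking, reject any header field, flag or attribute that this
 * handler does not understand instead of silently ignoring it.
 */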
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
                                       const struct nlmsghdr *nlh,
                                       struct nlattr **tb,
                                       struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        int i, err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
                NL_SET_ERR_MSG(extack,
                               "ipv4: Invalid header for route get request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
                                              rtm_ipv4_policy, extack);

        rtm = nlmsg_data(nlh);
        if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
            (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
            rtm->rtm_table || rtm->rtm_protocol ||
            rtm->rtm_scope || rtm->rtm_type) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
                return -EINVAL;
        }

        if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
                               RTM_F_LOOKUP_TABLE |
                               RTM_F_FIB_MATCH)) {
                NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
                                            rtm_ipv4_policy, extack);
        if (err)
                return err;

        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
                return -EINVAL;
        }

        for (i = 0; i <= RTA_MAX; i++) {
                if (!tb[i])
                        continue;

                switch (i) {
                case RTA_IIF:
                case RTA_OIF:
                case RTA_SRC:
                case RTA_DST:
                case RTA_IP_PROTO:
                case RTA_SPORT:
                case RTA_DPORT:
                case RTA_MARK:
                case RTA_UID:
                        break;
                default:
                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
                        return -EINVAL;
                }
        }

        return 0;
}

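/* RTM_GETROUTE handler: resolve one route exactly as the data path
 * would.  A dummy skb is built around the requested addresses,
 * protocol and ports, the lookup runs under rcu_read_lock() (input
 * path when RTA_IIF is present, output path otherwise), and the same
 * skb is then reused for the RTM_NEWROUTE reply.
 *
 * Userspace reaches this via e.g. (iproute2; ipproto/sport/dport need
 * a reasonably recent version, interface names are illustrative):
 *
 *   ip route get 192.0.2.1 from 198.51.100.2 iif eth0 \
 *           ipproto udp sport 4000 dport 53
 *
 * and `ip route get fibmatch ...` sets RTM_F_FIB_MATCH to request the
 * matching FIB entry rather than the resolved route.
 */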
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        u32 table_id = RT_TABLE_MAIN;
        __be16 sport = 0, dport = 0;
        struct fib_result res = {};
        u8 ip_proto = IPPROTO_UDP;
        struct rtable *rt = NULL;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi4 fl4 = {};
        __be32 dst = 0;
        __be32 src = 0;
        kuid_t uid;
        u32 iif;
        int err;
        int mark;

        err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        rtm = nlmsg_data(nlh);
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
        if (tb[RTA_UID])
                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
        else
                uid = (iif ? INVALID_UID : current_uid());

        if (tb[RTA_IP_PROTO]) {
                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
                                                  &ip_proto, AF_INET, extack);
                if (err)
                        return err;
        }

        if (tb[RTA_SPORT])
                sport = nla_get_be16(tb[RTA_SPORT]);

        if (tb[RTA_DPORT])
                dport = nla_get_be16(tb[RTA_DPORT]);

        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
        if (!skb)
                return -ENOBUFS;

        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
        fl4.flowi4_uid = uid;
        if (sport)
                fl4.fl4_sport = sport;
        if (dport)
                fl4.fl4_dport = dport;
        fl4.flowi4_proto = ip_proto;

        rcu_read_lock();

        if (iif) {
                struct net_device *dev;

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        err = -ENODEV;
                        goto errout_rcu;
                }

                fl4.flowi4_iif = iif; /* for rt_fill_info */
                skb->dev        = dev;
                skb->mark       = mark;
                err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
                                         dev, &res);

                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                fl4.flowi4_iif = LOOPBACK_IFINDEX;
                skb->dev = net->loopback_dev;
                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
                else
                        skb_dst_set(skb, &rt->dst);
        }

        if (err)
                goto errout_rcu;

        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

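        /* RTM_F_LOOKUP_TABLE: report the table the lookup actually hit
         * instead of defaulting to RT_TABLE_MAIN in the reply.
         */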
        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = res.table ? res.table->tb_id : 0;

        /* reset skb for netlink reply msg */
        skb_trim(skb, 0);
        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_reset_mac_header(skb);

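        /* RTM_F_FIB_MATCH: return the FIB entry that matched rather
         * than the dst the lookup resolved to.
         */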
        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                if (!res.fi) {
                        err = fib_props[res.type].error;
                        if (!err)
                                err = -EHOSTUNREACH;
                        goto errout_rcu;
                }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
                                    rt->rt_type, res.prefix, res.prefixlen,
                                    fl4.flowi4_tos, res.fi, 0);
        } else {
                err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
                                   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
        }
        if (err < 0)
                goto errout_rcu;

        rcu_read_unlock();

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
        return err;
errout_rcu:
        rcu_read_unlock();
        kfree_skb(skb);
        goto errout_free;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;

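/* Write-only trigger behind /proc/sys/net/ipv4/route/flush: any write
 * invalidates all cached routes and next-hop exceptions in the netns
 * by bumping the generation counters; reads fail with -EINVAL.
 *
 * Example:   sysctl -w net.ipv4.route.flush=1
 */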
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
                                        void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
{
        struct net *net = (struct net *)__ctl->extra1;

        if (write) {
                rt_cache_flush(net);
                fnhe_genid_bump(net);
                return 0;
        }

        return -EINVAL;
}

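/* Global tunables under /proc/sys/net/ipv4/route/.  Some of the gc_*
 * knobs are kept for backward compatibility with the old routing
 * cache; jiffies-valued entries use the *_jiffies proc handlers so
 * userspace works in seconds (milliseconds for gc_min_interval_ms).
 */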
static struct ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};

static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};

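/* Per-netns sysctl setup.  Only the "flush" entry is duplicated for
 * non-initial namespaces; the owning struct net is passed to the
 * handler through ->extra1, and the entry is hidden from unprivileged
 * user namespaces by clearing its procname.
 */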
static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (!tbl)
                        goto err_dup;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
        if (!net->ipv4.route_hdr)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

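/* Seed the per-netns generation counters.  Bumping rt_genid invalidates
 * every cached dst in the namespace; fnhe_genid does the same for
 * next-hop exceptions.  dev_addr_genid starts from a random value.
 */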
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

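/* Per-netns inetpeer base: long-lived per-destination state such as
 * ICMP rate-limiting counters hangs off this tree.
 */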
static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

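/* Boot-time initialization: IP ID generation state, per-cpu uncached
 * dst lists, the dst slab caches, /proc files and the RTM_GETROUTE
 * handler.  gc_thresh and max_size are effectively disabled (~0 and
 * INT_MAX) now that routes are no longer cached.
 */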
int __init ip_rt_init(void)
{
        int cpu;

        ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
                                  GFP_KERNEL);
        if (!ip_idents)
                panic("IP: failed to allocate ip_idents\n");

        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
        if (!ip_tstamps)
                panic("IP: failed to allocate ip_tstamps\n");

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif