2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
/* Final transmit step: hand a routed IPv6 packet to the neighbour/L2 layer.
 * Handles multicast loopback cloning, lightweight-tunnel redirection and the
 * neighbour lookup for the unicast nexthop.
 *
 * NOTE(review): this extracted region is missing several original lines
 * (braces, labels, returns); visible statements are preserved as-is.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
    struct dst_entry *dst = skb_dst(skb);
    struct net_device *dev = dst->dev;
    struct neighbour *neigh;
    struct in6_addr *nexthop;

    if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        /* Loop a copy back to local listeners when the socket asked for
         * multicast loopback and we are either a multicast-router socket
         * seeing a not-yet-forwarded packet or a member of the group on
         * this device.
         */
        if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
            ((mroute6_is_socket(net, skb) &&
            !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
            ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
            &ipv6_hdr(skb)->saddr))) {
            struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

            /* Do not check for IFF_ALLMULTI; multicast routing
             * is not supported in any case.
             */
            NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                net, sk, newskb, NULL, newskb->dev,

            /* A multicast packet with hop_limit 0 is discarded. */
            if (ipv6_hdr(skb)->hop_limit == 0) {
                IP6_INC_STATS(net, idev,
                    IPSTATS_MIB_OUTDISCARDS);

        IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

        /* Node-local-scope multicast must never leave the host. */
        if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
            IPV6_ADDR_SCOPE_NODELOCAL &&
            !(dev->flags & IFF_LOOPBACK)) {

    /* A lightweight tunnel may take over the transmission entirely. */
    if (lwtunnel_xmit_redirect(dst->lwtstate)) {
        int res = lwtunnel_xmit(skb);

        if (res < 0 || res == LWTUNNEL_XMIT_DONE)

    /* Resolve (or create) the neighbour entry for the nexthop and emit. */
    nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
    neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
    if (unlikely(!neigh))
        neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
    if (!IS_ERR(neigh)) {
        sock_confirm_neigh(skb, neigh);
        ret = neigh_output(neigh, skb);
        rcu_read_unlock_bh();
    rcu_read_unlock_bh();

    /* No usable neighbour: account the drop as "no route". */
    IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/* Post-routing stage of output: run the cgroup egress BPF program,
 * re-route when an IPsec policy appeared after SNAT, then fragment the
 * packet if required before handing it to ip6_finish_output2().
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
    ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
    /* Policy lookup after SNAT yielded a new policy.
     * NOTE(review): IPCB()/IPSKB_REROUTED are the IPv4 control-block
     * helpers; this looks copied from ip_output.c — confirm intended.
     */
    if (skb_dst(skb)->xfrm) {
        IPCB(skb)->flags |= IPSKB_REROUTED;
        return dst_output(net, sk, skb);

    /* Fragment when the packet exceeds the dst MTU (and is not GSO),
     * the dst demands per-packet fragmentation, or conntrack defrag
     * recorded a smaller maximum fragment size for this flow.
     */
    if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
        dst_allfrag(skb_dst(skb)) ||
        (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
        return ip6_fragment(net, sk, skb, ip6_finish_output2);

    return ip6_finish_output2(net, sk, skb);
/* Output entry point after routing for locally generated packets:
 * drop when IPv6 is administratively disabled on the egress device,
 * otherwise traverse NF_INET_POST_ROUTING (skipped for packets already
 * marked IP6SKB_REROUTED).
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
    struct net_device *dev = skb_dst(skb)->dev;
    struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

    skb->protocol = htons(ETH_P_IPV6);

    if (unlikely(idev->cnf.disable_ipv6)) {
        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);

    return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
        net, sk, skb, NULL, dev,
        !(IP6CB(skb)->flags & IP6SKB_REROUTED));
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
179 if (!np->autoflowlabel_set)
180 return ip6_default_np_autolabel(net);
182 return np->autoflowlabel;
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Builds the IPv6 header (and any extension headers from @opt) in front of
 * the payload, then delivers the packet through NF_INET_LOCAL_OUT.
 *
 * NOTE(review): extraction dropped several lines here (hdr/mtu declarations,
 * some braces and returns); visible statements are preserved as-is.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
    __u32 mark, struct ipv6_txoptions *opt, int tclass)
    struct net *net = sock_net(sk);
    const struct ipv6_pinfo *np = inet6_sk(sk);
    struct in6_addr *first_hop = &fl6->daddr;
    struct dst_entry *dst = skb_dst(skb);
    unsigned int head_room;
    u8 proto = fl6->flowi6_proto;
    int seg_len = skb->len;

    /* Room needed in front of the payload: IPv6 header + link layer,
     * plus any extension headers carried in @opt.
     */
    head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
    head_room += opt->opt_nflen + opt->opt_flen;

    /* Reallocate headroom if the caller's skb is too tight. */
    if (unlikely(skb_headroom(skb) < head_room)) {
        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
            IPSTATS_MIB_OUTDISCARDS);

        skb_set_owner_w(skb2, skb->sk);

    /* Push extension headers; fragmentable ones first, then the
     * non-fragmentable chain which may also rewrite the first hop
     * (e.g. for a routing header).
     */
    seg_len += opt->opt_nflen + opt->opt_flen;

    ipv6_push_frag_opts(skb, opt, &proto);

    ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,

    skb_push(skb, sizeof(struct ipv6hdr));
    skb_reset_network_header(skb);

    /*
     * Fill in the IPv6 header
     */
    hlimit = np->hop_limit;
    hlimit = ip6_dst_hoplimit(dst);

    ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
        ip6_autoflowlabel(net, np), fl6));

    hdr->payload_len = htons(seg_len);
    hdr->nexthdr = proto;
    hdr->hop_limit = hlimit;

    hdr->saddr = fl6->saddr;
    hdr->daddr = *first_hop;

    skb->protocol = htons(ETH_P_IPV6);
    skb->priority = sk->sk_priority;

    /* Within MTU, DF ignored, or GSO: hand to netfilter LOCAL_OUT. */
    if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
        IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
            IPSTATS_MIB_OUT, skb->len);

        /* if egress device is enslaved to an L3 master device pass the
         * skb to its handler for processing
         */
        skb = l3mdev_ip6_out((struct sock *)sk, skb);

        /* hooks should never assume socket lock is held.
         * we promote our socket to non const
         */
        return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
            net, (struct sock *)sk, skb, NULL, dst->dev,

    /* Oversized and not allowed to fragment: report EMSGSIZE locally. */
    /* ipv6_local_error() does not require socket lock,
     * we promote our socket to non const
     */
    ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

    IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
EXPORT_SYMBOL(ip6_xmit);
/* Deliver a Router Alert packet to every raw socket registered on the
 * ip6_ra_chain with a matching selector (and matching bound device, if
 * any). All but the last recipient get a clone; the last gets the
 * original skb.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
    struct ip6_ra_chain *ra;
    struct sock *last = NULL;

    read_lock(&ip6_ra_lock);
    for (ra = ip6_ra_chain; ra; ra = ra->next) {
        struct sock *sk = ra->sk;

        /* Match selector and (if the socket is bound) the ingress device. */
        if (sk && ra->sel == sel &&
            (!sk->sk_bound_dev_if ||
            sk->sk_bound_dev_if == skb->dev->ifindex)) {
            struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

            rawv6_rcv(last, skb2);

    /* Last subscriber consumes the original skb. */
    rawv6_rcv(last, skb);
    read_unlock(&ip6_ra_lock);

    read_unlock(&ip6_ra_lock);
/* Decide how to treat a packet whose destination we answer proxy-NDP for.
 * Walks the extension-header chain to the upper protocol; ND ICMPv6
 * messages are passed up locally, link-local destinations are rejected.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
    struct ipv6hdr *hdr = ipv6_hdr(skb);
    u8 nexthdr = hdr->nexthdr;

    /* Skip over any extension headers to find the transport header. */
    if (ipv6_ext_hdr(nexthdr)) {
        offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);

    offset = sizeof(struct ipv6hdr);

    if (nexthdr == IPPROTO_ICMPV6) {
        struct icmp6hdr *icmp6;

        /* Make sure at least the ICMPv6 type byte is in linear data. */
        if (!pskb_may_pull(skb, (skb_network_header(skb) +
            offset + 1 - skb->data)))

        icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

        switch (icmp6->icmp6_type) {
        case NDISC_ROUTER_SOLICITATION:
        case NDISC_ROUTER_ADVERTISEMENT:
        case NDISC_NEIGHBOUR_SOLICITATION:
        case NDISC_NEIGHBOUR_ADVERTISEMENT:

            /* For reaction involving unicast neighbor discovery
             * message destined to the proxied address, pass it to
             */

    /*
     * The proxying router can't forward traffic sent to a link-local
     * address, so signal the sender and discard the packet. This
     * behavior is clarified by the MIPv6 specification.
     */
    if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
        dst_link_failure(skb);
/* Final step of forwarding: bump the forwarded-datagram/octet counters
 * and hand the packet to the output path via dst_output().
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
    struct dst_entry *dst = skb_dst(skb);

    __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
    __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

    return dst_output(net, sk, skb);
/* Return true when @skb cannot be forwarded with the given @mtu.
 * Conntrack-defragmented packets are judged by their recorded
 * frag_max_size; GSO packets by whether the segments would fit.
 */
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)

    /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
    if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)

    /* GSO: fits if the network-layer length of each segment fits. */
    if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
/* Forward a received IPv6 packet: run policy/sanity checks, handle Router
 * Alert and proxy-NDP special cases, decrement the hop limit, possibly
 * emit a Redirect, enforce the path MTU, then hand off to the
 * NF_INET_FORWARD hook -> ip6_forward_finish().
 *
 * NOTE(review): extraction dropped many lines (drop labels, hop-limit
 * decrement, returns); visible statements are preserved as-is.
 */
int ip6_forward(struct sk_buff *skb)
    struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
    struct dst_entry *dst = skb_dst(skb);
    struct ipv6hdr *hdr = ipv6_hdr(skb);
    struct inet6_skb_parm *opt = IP6CB(skb);
    struct net *net = dev_net(dst->dev);

    /* Forwarding disabled, non-host packets, socket-owned or LRO skbs
     * are not forwarded.
     */
    if (net->ipv6.devconf_all->forwarding == 0)

    if (skb->pkt_type != PACKET_HOST)

    if (unlikely(skb->sk))

    if (skb_warn_if_lro(skb))

    if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);

    skb_forward_csum(skb);

    /*
     * We DO NOT make any processing on
     * RA packets, pushing them to user level AS IS
     * without any WARRANTY that application will be able
     * to interpret them. The reason is that we
     * cannot make anything clever here.
     *
     * We are not end-node, so that if packet contains
     * AH/ESP, we cannot make anything.
     * Defragmentation also would be mistake, RA packets
     * cannot be fragmented, because there is no warranty
     * that different fragments will go along one path. --ANK
     */
    if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
        if (ip6_call_ra_chain(skb, ntohs(opt->ra)))

    /*
     * check and decrement ttl
     */
    if (hdr->hop_limit <= 1) {
        /* Force OUTPUT device used as source address */
        icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

    /* XXX: idev->cnf.proxy_ndp? */
    if (net->ipv6.devconf_all->proxy_ndp &&
        pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
        int proxied = ip6_forward_proxy_check(skb);

            return ip6_input(skb);
        else if (proxied < 0) {
            __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);

    if (!xfrm6_route_forward(skb)) {
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);

    /* IPv6 specs say nothing about it, but it is clear that we cannot
     * send redirects to source routed frames.
     * We don't send redirects to frames decapsulated from IPsec.
     */
    if (IP6CB(skb)->iif == dst->dev->ifindex &&
        opt->srcrt == 0 && !skb_sec_path(skb)) {
        struct in6_addr *target = NULL;
        struct inet_peer *peer;

        /*
         * incoming and outgoing devices are the same
         */
        rt = (struct rt6_info *) dst;
        if (rt->rt6i_flags & RTF_GATEWAY)
            target = &rt->rt6i_gateway;

            target = &hdr->daddr;

        peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

        /* Limit redirects both by destination (here)
         * and by source (inside ndisc_send_redirect)
         */
        if (inet_peer_xrlim_allow(peer, 1*HZ))
            ndisc_send_redirect(skb, target);

        int addrtype = ipv6_addr_type(&hdr->saddr);

        /* This check is security critical. */
        if (addrtype == IPV6_ADDR_ANY ||
            addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))

        if (addrtype & IPV6_ADDR_LINKLOCAL) {
            icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                ICMPV6_NOT_NEIGHBOUR, 0);

    mtu = ip6_dst_mtu_forward(dst);
    if (mtu < IPV6_MIN_MTU)

    if (ip6_pkt_too_big(skb, mtu)) {
        /* Again, force OUTPUT device used as source address */
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
        __IP6_INC_STATS(net, ip6_dst_idev(dst),
            IPSTATS_MIB_FRAGFAILS);

    /* Writable copy needed before the hop-limit is decremented. */
    if (skb_cow(skb, dst->dev->hard_header_len)) {
        __IP6_INC_STATS(net, ip6_dst_idev(dst),
            IPSTATS_MIB_OUTDISCARDS);

    /* Mangling hops number delayed to point after skb COW */

    return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
        net, NULL, skb, skb->dev, dst->dev,

    __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
/* Copy per-packet metadata from @from to a freshly allocated fragment
 * @to: packet type, priority, protocol, dst reference, mark, hash,
 * traffic-control index and security mark.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
    to->pkt_type = from->pkt_type;
    to->priority = from->priority;
    to->protocol = from->protocol;

    /* Each fragment holds its own reference on the route. */
    skb_dst_set(to, dst_clone(skb_dst(from)));

    to->mark = from->mark;

    skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
    to->tc_index = from->tc_index;

    skb_copy_secmark(to, from);
/* Fragment @skb to fit the path MTU and emit each fragment via @output.
 * Takes the fast path (re-using the skb's existing frag_list as ready-made
 * fragments) when the geometry allows, otherwise falls back to the slow
 * path that allocates and copies each fragment.
 *
 * NOTE(review): extraction dropped many lines here (goto labels, loop
 * headers, declarations like frag_id/fh/hroom/troom); visible statements
 * are preserved as-is.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
    int (*output)(struct net *, struct sock *, struct sk_buff *))
    struct sk_buff *frag;
    struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
    /* Only honour the socket's frag_size for locally originated, non-nested
     * transmissions.
     */
    struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
        inet6_sk(skb->sk) : NULL;
    struct ipv6hdr *tmp_hdr;
    unsigned int mtu, hlen, left, len;
    int ptr, offset = 0, err = 0;
    u8 *prevhdr, nexthdr = 0;

    err = ip6_find_1stfragopt(skb, &prevhdr);

    mtu = ip6_skb_dst_mtu(skb);

    /* We must not fragment if the socket is set to force MTU discovery
     * or if the skb is not generated by a local socket.
     */
    if (unlikely(!skb->ignore_df && skb->len > mtu))

    if (IP6CB(skb)->frag_max_size) {
        if (IP6CB(skb)->frag_max_size > mtu)

        /* don't send fragments larger than what we received */
        mtu = IP6CB(skb)->frag_max_size;
        if (mtu < IPV6_MIN_MTU)

    if (np && np->frag_size < mtu) {

    if (mtu < hlen + sizeof(struct frag_hdr) + 8)

    /* Per-fragment payload budget, after unfragmentable part + frag hdr. */
    mtu -= hlen + sizeof(struct frag_hdr);

    frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
        &ipv6_hdr(skb)->saddr);

    /* Checksums must be finalized before the payload is split. */
    if (skb->ip_summed == CHECKSUM_PARTIAL &&
        (err = skb_checksum_help(skb)))

    hroom = LL_RESERVED_SPACE(rt->dst.dev);
    if (skb_has_frag_list(skb)) {
        unsigned int first_len = skb_pagelen(skb);
        struct sk_buff *frag2;

        /* Fast path only if the head and every list member already have
         * fragment-compatible geometry (size, 8-byte alignment, headroom).
         */
        if (first_len - hlen > mtu ||
            ((first_len - hlen) & 7) ||
            skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))

        skb_walk_frags(skb, frag) {
            /* Correct geometry. */
            if (frag->len > mtu ||
                ((frag->len & 7) && frag->next) ||
                skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                goto slow_path_clean;

            /* Partially cloned skb? */
            if (skb_shared(frag))
                goto slow_path_clean;

            /* Transfer wmem accounting from head to the list member. */
            frag->destructor = sock_wfree;

            skb->truesize -= frag->truesize;

        /* Detach the frag list and turn the head into fragment #0. */
        *prevhdr = NEXTHDR_FRAGMENT;
        tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);

        frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);

        __skb_pull(skb, hlen);
        fh = __skb_push(skb, sizeof(struct frag_hdr));
        __skb_push(skb, hlen);
        skb_reset_network_header(skb);
        memcpy(skb_network_header(skb), tmp_hdr, hlen);

        fh->nexthdr = nexthdr;
        fh->frag_off = htons(IP6_MF);
        fh->identification = frag_id;

        first_len = skb_pagelen(skb);
        skb->data_len = first_len - skb_headlen(skb);
        skb->len = first_len;
        ipv6_hdr(skb)->payload_len = htons(first_len -
            sizeof(struct ipv6hdr));

            /* Prepare header of the next frame,
             * before previous one went down. */
                frag->ip_summed = CHECKSUM_NONE;
                skb_reset_transport_header(frag);
                fh = __skb_push(frag, sizeof(struct frag_hdr));
                __skb_push(frag, hlen);
                skb_reset_network_header(frag);
                memcpy(skb_network_header(frag), tmp_hdr,
                offset += skb->len - hlen - sizeof(struct frag_hdr);
                fh->nexthdr = nexthdr;
                fh->frag_off = htons(offset);
                    fh->frag_off |= htons(IP6_MF);
                fh->identification = frag_id;
                ipv6_hdr(frag)->payload_len =
                    sizeof(struct ipv6hdr));
                ip6_copy_metadata(frag, skb);

            err = output(net, sk, skb);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                    IPSTATS_MIB_FRAGCREATES);

            skb_mark_not_on_list(skb);

            IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                IPSTATS_MIB_FRAGOKS);

        /* Fast path failed mid-stream: free remaining fragments. */
        kfree_skb_list(frag);

        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
            IPSTATS_MIB_FRAGFAILS);

slow_path_clean:
        /* Undo the wmem-accounting transfer done above. */
        skb_walk_frags(skb, frag2) {
            frag2->destructor = NULL;
            skb->truesize += frag2->truesize;

    left = skb->len - hlen; /* Space per frame */
    ptr = hlen; /* Where to start from */

    /*
     * Fragment the datagram.
     */
    troom = rt->dst.dev->needed_tailroom;

    /*
     * Keep copying data until we run out.
     */
        u8 *fragnexthdr_offset;

        /* IF: it doesn't fit, use 'mtu' - the data space left */
        /* IF: we are not sending up to and including the packet end
           then align the next start on an eight byte boundary */

        /* Allocate buffer */
        frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
            hroom + troom, GFP_ATOMIC);

        /*
         * Set up data on packet
         */
        ip6_copy_metadata(frag, skb);
        skb_reserve(frag, hroom);
        skb_put(frag, len + hlen + sizeof(struct frag_hdr));
        skb_reset_network_header(frag);
        fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
        frag->transport_header = (frag->network_header + hlen +
            sizeof(struct frag_hdr));

        /*
         * Charge the memory for the fragment to any owner
         */
        skb_set_owner_w(frag, skb->sk);

        /*
         * Copy the packet header into the new buffer.
         */
        skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

        /* Rewrite the next-header byte of the preceding header to point
         * at the fragment header.
         */
        fragnexthdr_offset = skb_network_header(frag);
        fragnexthdr_offset += prevhdr - skb_network_header(skb);
        *fragnexthdr_offset = NEXTHDR_FRAGMENT;

        /*
         * Build fragment header.
         */
        fh->nexthdr = nexthdr;
        fh->identification = frag_id;

        /*
         * Copy a block of the IP datagram.
         */
        BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),

        fh->frag_off = htons(offset);
            fh->frag_off |= htons(IP6_MF);
        ipv6_hdr(frag)->payload_len = htons(frag->len -
            sizeof(struct ipv6hdr));

        /*
         * Put this fragment into the sending queue.
         */
        err = output(net, sk, frag);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
            IPSTATS_MIB_FRAGCREATES);

    IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
        IPSTATS_MIB_FRAGOKS);

    /* Overfragmented dst: stop GSO on the socket before reporting. */
    if (skb->sk && dst_allfrag(skb_dst(skb)))
        sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

    icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);

    IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
        IPSTATS_MIB_FRAGFAILS);
870 static inline int ip6_rt_check(const struct rt6key *rt_key,
871 const struct in6_addr *fl_addr,
872 const struct in6_addr *addr_cache)
874 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
875 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
/* Validate a socket's cached dst against the flow @fl6. Returns the dst
 * if still usable for this flow, otherwise releases it (logic partially
 * outside the visible extract) so the caller performs a fresh lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
    struct dst_entry *dst,
    const struct flowi6 *fl6)
    struct ipv6_pinfo *np = inet6_sk(sk);

    /* An IPv4-mapped dst (from a v6-mapped socket) can never be reused
     * for a native IPv6 flow.
     */
    if (dst->ops->family != AF_INET6) {

    rt = (struct rt6_info *)dst;
    /* Yes, checking route validity in not connected
     * case is not very simple. Take into account,
     * that we do not support routing by source, TOS,
     * and MSG_DONTROUTE --ANK (980726)
     *
     * 1. ip6_rt_check(): If route was host route,
     * check that cached destination is current.
     * If it is network route, we still may
     * check its validity using saved pointer
     * to the last used address: daddr_cache.
     * We do not want to save whole address now,
     * (because main consumer of this service
     * is tcp, which has not this problem),
     * so that the last trick works only on connected
     * 2. oif also should be the same.
     */
    if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
        ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
        (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
        (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
/* Core of the dst lookup: resolve a route for @fl6 into *@dst, choosing a
 * source address when the flow has none, retrying with RT6_LOOKUP_F_IFACE
 * for source-specific routing, and (with optimistic DAD) substituting the
 * default router's dst when the chosen source is still optimistic.
 *
 * NOTE(review): extraction dropped several lines (declarations, braces,
 * returns); visible statements are preserved as-is.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
    struct dst_entry **dst, struct flowi6 *fl6)
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD

    /* The correct way to handle this would be to do
     * ip6_route_get_saddr, and then ip6_route_output; however,
     * the route-specific preferred source forces the
     * ip6_route_output call _before_ ip6_route_get_saddr.
     *
     * In source specific routing (no src=any default route),
     * ip6_route_output will fail given src=any saddr, though, so
     * that's why we try it again later.
     */
    if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
        struct fib6_info *from;

        bool had_dst = *dst != NULL;

        *dst = ip6_route_output(net, sk, fl6);
        rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

        from = rt ? rcu_dereference(rt->from) : NULL;
        err = ip6_route_get_saddr(net, from, &fl6->daddr,
            sk ? inet6_sk(sk)->srcprefs : 0,

            goto out_err_release;

        /* If we had an erroneous initial result, pretend it
         * never existed and let the SA-enabled version take
         */
        if (!had_dst && (*dst)->error) {

            flags |= RT6_LOOKUP_F_IFACE;

        *dst = ip6_route_output_flags(net, sk, fl6, flags);

        goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
    /*
     * Here if the dst entry we've looked up
     * has a neighbour entry that is in the INCOMPLETE
     * state and the src address from the flow is
     * marked as OPTIMISTIC, we release the found
     * dst entry and replace it instead with the
     * dst entry of the nexthop router
     */
    rt = (struct rt6_info *) *dst;

    n = __ipv6_neigh_lookup_noref(rt->dst.dev,
        rt6_nexthop(rt, &fl6->daddr));
    err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
    rcu_read_unlock_bh();

        struct inet6_ifaddr *ifp;
        struct flowi6 fl_gw6;

        ifp = ipv6_get_ifaddr(net, &fl6->saddr,

        redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);

            /*
             * We need to get the dst entry for the
             * default router instead
             */
            memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
            memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
            *dst = ip6_route_output(net, sk, &fl_gw6);
            err = (*dst)->error;

                goto out_err_release;

    /* v4-mapped source with a non-v4-mapped destination is unsupported. */
    if (ipv6_addr_v4mapped(&fl6->saddr) &&
        !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
        err = -EAFNOSUPPORT;
        goto out_err_release;

    if (err == -ENETUNREACH)
        IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
/**
 * ip6_dst_lookup - perform route lookup on flow
 * @net: network namespace to look the route up in
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
    /* Thin exported wrapper around the internal lookup helper. */
    return ip6_dst_lookup_tail(net, sk, dst, fl6);
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error (via ERR_PTR) on failure.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
    const struct in6_addr *final_dst)
    struct dst_entry *dst = NULL;

    err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);

        return ERR_PTR(err);

        fl6->daddr = *final_dst;

    /* Let xfrm apply any IPsec transformation to the found route. */
    return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @connected: whether @sk is connected or not
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * In addition, for a connected socket, cache the dst in the socket
 * if the current cache is not valid.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error (via ERR_PTR) on failure.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
    const struct in6_addr *final_dst,
    struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

    /* Reuse the socket's cached dst when it is still valid for @fl6. */
    dst = ip6_sk_dst_check(sk, dst, fl6);

    dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
    if (connected && !IS_ERR(dst))
        ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1124 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1127 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1130 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1133 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* Recompute *mtu (and the derived *maxfraglen) for the next fragment in
 * ip6_append_data: outside an XFRM tunnel, the first fragment reserves
 * the dst's header_len while later fragments treat that space as data.
 */
static void ip6_append_data_mtu(unsigned int *mtu,
    unsigned int fragheaderlen,
    struct sk_buff *skb,
    struct rt6_info *rt,
    unsigned int orig_mtu)
    if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {

        /* first fragment, reserve header_len */
        *mtu = orig_mtu - rt->dst.header_len;

        /*
         * this fragment is not first, the headers
         * space is regarded as data space.
         */

    /* Largest 8-byte-aligned fragment payload for the current mtu. */
    *maxfraglen = ((*mtu - fragheaderlen) & ~7)
        + fragheaderlen - sizeof(struct frag_hdr);
/* Initialize the cork (pending-data state) for ip6_append_data: deep-copy
 * the tx options from @ipc6, take over the route, and compute the mtu,
 * hop limit, tclass and timestamping flags used by subsequent appends.
 *
 * NOTE(review): extraction dropped several lines (error returns, braces);
 * visible statements are preserved as-is.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
    struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
    struct rt6_info *rt, struct flowi6 *fl6)
    struct ipv6_pinfo *np = inet6_sk(sk);

    struct ipv6_txoptions *opt = ipc6->opt;

    /* Options must be deep-copied: the caller's @opt may not outlive
     * the cork.
     */
    if (WARN_ON(v6_cork->opt))

    v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
    if (unlikely(!v6_cork->opt))

    v6_cork->opt->tot_len = sizeof(*opt);
    v6_cork->opt->opt_flen = opt->opt_flen;
    v6_cork->opt->opt_nflen = opt->opt_nflen;

    v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
    if (opt->dst0opt && !v6_cork->opt->dst0opt)

    v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
    if (opt->dst1opt && !v6_cork->opt->dst1opt)

    v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
    if (opt->hopopt && !v6_cork->opt->hopopt)

    v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
    if (opt->srcrt && !v6_cork->opt->srcrt)

    /* need source address above miyazawa*/

    cork->base.dst = &rt->dst;
    cork->fl.u.ip6 = *fl6;
    v6_cork->hop_limit = ipc6->hlimit;
    v6_cork->tclass = ipc6->tclass;
    /* MTU selection: probe mode uses the device mtu directly; otherwise
     * use the dst mtu (of the xfrm path dst when tunneled).
     */
    if (rt->dst.flags & DST_XFRM_TUNNEL)
        mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
            READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);

        mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
            READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));

    if (np->frag_size < mtu) {
        mtu = np->frag_size;

    if (mtu < IPV6_MIN_MTU)

    cork->base.fragsize = mtu;
    cork->base.gso_size = ipc6->gso_size;
    cork->base.tx_flags = 0;
    sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

    if (dst_allfrag(xfrm_dst_path(&rt->dst)))
        cork->base.flags |= IPCORK_ALLFRAG;
    cork->base.length = 0;

    cork->base.transmit_time = ipc6->sockc.transmit_time;
1236 static int __ip6_append_data(struct sock *sk,
1238 struct sk_buff_head *queue,
1239 struct inet_cork *cork,
1240 struct inet6_cork *v6_cork,
1241 struct page_frag *pfrag,
1242 int getfrag(void *from, char *to, int offset,
1243 int len, int odd, struct sk_buff *skb),
1244 void *from, int length, int transhdrlen,
1245 unsigned int flags, struct ipcm6_cookie *ipc6)
1247 struct sk_buff *skb, *skb_prev = NULL;
1248 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1250 int dst_exthdrlen = 0;
1256 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1257 struct ipv6_txoptions *opt = v6_cork->opt;
1258 int csummode = CHECKSUM_NONE;
1259 unsigned int maxnonfragsize, headersize;
1260 unsigned int wmem_alloc_delta = 0;
1263 skb = skb_peek_tail(queue);
1265 exthdrlen = opt ? opt->opt_flen : 0;
1266 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1269 paged = !!cork->gso_size;
1270 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1273 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1274 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1275 tskey = sk->sk_tskey++;
1277 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1279 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1280 (opt ? opt->opt_nflen : 0);
1281 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1282 sizeof(struct frag_hdr);
1284 headersize = sizeof(struct ipv6hdr) +
1285 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1286 (dst_allfrag(&rt->dst) ?
1287 sizeof(struct frag_hdr) : 0) +
1288 rt->rt6i_nfheader_len;
1290 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1291 * the first fragment
1293 if (headersize + transhdrlen > mtu)
1296 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1297 (sk->sk_protocol == IPPROTO_UDP ||
1298 sk->sk_protocol == IPPROTO_RAW)) {
1299 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1300 sizeof(struct ipv6hdr));
1304 if (ip6_sk_ignore_df(sk))
1305 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1307 maxnonfragsize = mtu;
1309 if (cork->length + length > maxnonfragsize - headersize) {
1311 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1312 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1316 /* CHECKSUM_PARTIAL only with no extension headers and when
1317 * we are not going to fragment
1319 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1320 headersize == sizeof(struct ipv6hdr) &&
1321 length <= mtu - headersize &&
1322 (!(flags & MSG_MORE) || cork->gso_size) &&
1323 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1324 csummode = CHECKSUM_PARTIAL;
1327 * Let's try using as much space as possible.
1328 * Use MTU if total length of the message fits into the MTU.
1329 * Otherwise, we need to reserve fragment header and
1330 * fragment alignment (= 8-15 octects, in total).
1332 * Note that we may need to "move" the data from the tail of
1333 * of the buffer to the new fragment when we split
1336 * FIXME: It may be fragmented into multiple chunks
1337 * at once if non-fragmentable extension headers
1342 cork->length += length;
1346 while (length > 0) {
1347 /* Check if the remaining data fits into current packet. */
1348 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1350 copy = maxfraglen - skb->len;
1354 unsigned int datalen;
1355 unsigned int fraglen;
1356 unsigned int fraggap;
1357 unsigned int alloclen;
1358 unsigned int pagedlen;
1360 /* There's no room in the current skb */
1362 fraggap = skb->len - maxfraglen;
1365 /* update mtu and maxfraglen if necessary */
1366 if (!skb || !skb_prev)
1367 ip6_append_data_mtu(&mtu, &maxfraglen,
1368 fragheaderlen, skb, rt,
1374 * If remaining data exceeds the mtu,
1375 * we know we need more fragment(s).
1377 datalen = length + fraggap;
1379 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1380 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1381 fraglen = datalen + fragheaderlen;
1384 if ((flags & MSG_MORE) &&
1385 !(rt->dst.dev->features&NETIF_F_SG))
1390 alloclen = min_t(int, fraglen, MAX_HEADER);
1391 pagedlen = fraglen - alloclen;
1394 alloclen += dst_exthdrlen;
1396 if (datalen != length + fraggap) {
1398 * this is not the last fragment, the trailer
1399 * space is regarded as data space.
1401 datalen += rt->dst.trailer_len;
1404 alloclen += rt->dst.trailer_len;
1405 fraglen = datalen + fragheaderlen;
1408 * We just reserve space for fragment header.
1409 * Note: this may be overallocation if the message
1410 * (without MSG_MORE) fits into the MTU.
1412 alloclen += sizeof(struct frag_hdr);
1414 copy = datalen - transhdrlen - fraggap - pagedlen;
1420 skb = sock_alloc_send_skb(sk,
1422 (flags & MSG_DONTWAIT), &err);
1425 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1427 skb = alloc_skb(alloclen + hh_len,
1435 * Fill in the control structures
1437 skb->protocol = htons(ETH_P_IPV6);
1438 skb->ip_summed = csummode;
1440 /* reserve for fragmentation and ipsec header */
1441 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1444 /* Only the initial fragment is time stamped */
1445 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1447 skb_shinfo(skb)->tskey = tskey;
1451 * Find where to start putting bytes
1453 data = skb_put(skb, fraglen - pagedlen);
1454 skb_set_network_header(skb, exthdrlen);
1455 data += fragheaderlen;
1456 skb->transport_header = (skb->network_header +
1459 skb->csum = skb_copy_and_csum_bits(
1460 skb_prev, maxfraglen,
1461 data + transhdrlen, fraggap, 0);
1462 skb_prev->csum = csum_sub(skb_prev->csum,
1465 pskb_trim_unique(skb_prev, maxfraglen);
1468 getfrag(from, data + transhdrlen, offset,
1469 copy, fraggap, skb) < 0) {
1476 length -= copy + transhdrlen;
1481 if ((flags & MSG_CONFIRM) && !skb_prev)
1482 skb_set_dst_pending_confirm(skb, 1);
1485 * Put the packet on the pending queue
1487 if (!skb->destructor) {
1488 skb->destructor = sock_wfree;
1490 wmem_alloc_delta += skb->truesize;
1492 __skb_queue_tail(queue, skb);
1499 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1500 skb_tailroom(skb) >= copy) {
1504 if (getfrag(from, skb_put(skb, copy),
1505 offset, copy, off, skb) < 0) {
1506 __skb_trim(skb, off);
1511 int i = skb_shinfo(skb)->nr_frags;
1514 if (!sk_page_frag_refill(sk, pfrag))
1517 if (!skb_can_coalesce(skb, i, pfrag->page,
1520 if (i == MAX_SKB_FRAGS)
1523 __skb_fill_page_desc(skb, i, pfrag->page,
1525 skb_shinfo(skb)->nr_frags = ++i;
1526 get_page(pfrag->page);
1528 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1530 page_address(pfrag->page) + pfrag->offset,
1531 offset, copy, skb->len, skb) < 0)
1534 pfrag->offset += copy;
1535 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1537 skb->data_len += copy;
1538 skb->truesize += copy;
1539 wmem_alloc_delta += copy;
1545 if (wmem_alloc_delta)
1546 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1552 cork->length -= length;
1553 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1554 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/*
 * ip6_append_data - append user data to the socket's pending (corked)
 * IPv6 output queue.
 *
 * On the first call for an empty sk_write_queue the cork state is
 * initialised via ip6_setup_cork() and the length of the destination
 * options (ipc6->opt->opt_flen) is folded into both 'length' and
 * 'transhdrlen'.  The actual queueing work is delegated to
 * __ip6_append_data().
 *
 * getfrag() is the caller's copy callback (e.g. copies from user space).
 * Returns 0 on success or a negative errno.
 *
 * NOTE(review): several lines of the original function (local
 * declarations, early returns, error handling) are elided in this
 * excerpt; do not assume the visible lines are contiguous.
 */
1558 int ip6_append_data(struct sock *sk,
1559 int getfrag(void *from, char *to, int offset, int len,
1560 int odd, struct sk_buff *skb),
1561 void *from, int length, int transhdrlen,
1562 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1563 struct rt6_info *rt, unsigned int flags)
1565 struct inet_sock *inet = inet_sk(sk);
1566 struct ipv6_pinfo *np = inet6_sk(sk);
/* MSG_PROBE: path MTU probe only - nothing is queued. */
1570 if (flags&MSG_PROBE)
/* First fragment of a new corked message: set up cork state. */
1572 if (skb_queue_empty(&sk->sk_write_queue)) {
1576 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
/* Destination-option bytes are carried as part of the payload. */
1581 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1582 length += exthdrlen;
1583 transhdrlen += exthdrlen;
/* NOTE(review): in the full source this is the else branch -
 * subsequent calls reuse the flow stored when the cork was set up. */
1585 fl6 = &inet->cork.fl.u.ip6;
1589 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1590 &np->cork, sk_page_frag(sk), getfrag,
1591 from, length, transhdrlen, flags, ipc6);
1593 EXPORT_SYMBOL_GPL(ip6_append_data);
/*
 * ip6_cork_release - free all state held by a cork.
 *
 * Frees the cached IPv6 tx options (each sub-option buffer, then the
 * container), drops the cached dst reference, clears the
 * all-fragments flag, and wipes the stored flow key so the cork can
 * be reused.
 *
 * NOTE(review): the enclosing "if (v6_cork->opt)" guard and the
 * closing braces are elided in this excerpt.
 */
1595 static void ip6_cork_release(struct inet_cork_full *cork,
1596 struct inet6_cork *v6_cork)
1599 kfree(v6_cork->opt->dst0opt);
1600 kfree(v6_cork->opt->dst1opt);
1601 kfree(v6_cork->opt->hopopt);
1602 kfree(v6_cork->opt->srcrt);
1603 kfree(v6_cork->opt);
1604 v6_cork->opt = NULL;
/* Drop the route cached for this corked message, if any. */
1607 if (cork->base.dst) {
1608 dst_release(cork->base.dst);
1609 cork->base.dst = NULL;
1610 cork->base.flags &= ~IPCORK_ALLFRAG;
/* Clear the saved flow so stale addresses cannot leak into reuse. */
1612 memset(&cork->fl, 0, sizeof(cork->fl));
/*
 * __ip6_make_skb - collapse the queued fragments of a corked message
 * into one skb and prepend the IPv6 header.
 *
 * Dequeues the first skb as the head, chains every remaining queued
 * skb onto its frag_list (accounting len/data_len/truesize as it
 * goes), pushes any extension headers from the cork's tx options,
 * then builds the IPv6 header from the cork/flow state.  Finishes by
 * attaching the cached route, bumping output stats, and releasing the
 * cork.
 *
 * NOTE(review): several lines (the NULL-queue check, "out:" label,
 * return statement, closing braces) are elided in this excerpt.
 */
1615 struct sk_buff *__ip6_make_skb(struct sock *sk,
1616 struct sk_buff_head *queue,
1617 struct inet_cork_full *cork,
1618 struct inet6_cork *v6_cork)
1620 struct sk_buff *skb, *tmp_skb;
1621 struct sk_buff **tail_skb;
1622 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1623 struct ipv6_pinfo *np = inet6_sk(sk);
1624 struct net *net = sock_net(sk);
1625 struct ipv6hdr *hdr;
1626 struct ipv6_txoptions *opt = v6_cork->opt;
1627 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1628 struct flowi6 *fl6 = &cork->fl.u.ip6;
1629 unsigned char proto = fl6->flowi6_proto;
/* First queued skb becomes the head of the resulting packet. */
1631 skb = __skb_dequeue(queue);
1634 tail_skb = &(skb_shinfo(skb)->frag_list);
1636 /* move skb->data to ip header from ext header */
1637 if (skb->data < skb_network_header(skb))
1638 __skb_pull(skb, skb_network_offset(skb));
/* Chain the remaining fragments onto the head's frag_list and fold
 * their sizes into the head skb's accounting. */
1639 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1640 __skb_pull(tmp_skb, skb_network_header_len(skb));
1641 *tail_skb = tmp_skb;
1642 tail_skb = &(tmp_skb->next);
1643 skb->len += tmp_skb->len;
1644 skb->data_len += tmp_skb->len;
1645 skb->truesize += tmp_skb->truesize;
/* Memory is now accounted on the head skb only. */
1646 tmp_skb->destructor = NULL;
1650 /* Allow local fragmentation. */
1651 skb->ignore_df = ip6_sk_ignore_df(sk);
/* final_dst may be rewritten below if a routing header is pushed. */
1653 *final_dst = fl6->daddr;
1654 __skb_pull(skb, skb_network_header_len(skb));
/* Push fragmentable then non-fragmentable extension headers; each
 * call updates 'proto' to chain the next-header values. */
1655 if (opt && opt->opt_flen)
1656 ipv6_push_frag_opts(skb, opt, &proto);
1657 if (opt && opt->opt_nflen)
1658 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
/* Prepend and populate the fixed IPv6 header. */
1660 skb_push(skb, sizeof(struct ipv6hdr));
1661 skb_reset_network_header(skb);
1662 hdr = ipv6_hdr(skb);
1664 ip6_flow_hdr(hdr, v6_cork->tclass,
1665 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1666 ip6_autoflowlabel(net, np), fl6));
1667 hdr->hop_limit = v6_cork->hop_limit;
1668 hdr->nexthdr = proto;
1669 hdr->saddr = fl6->saddr;
1670 hdr->daddr = *final_dst;
1672 skb->priority = sk->sk_priority;
1673 skb->mark = sk->sk_mark;
1675 skb->tstamp = cork->base.transmit_time;
/* Keep our own reference to the cached route for this skb. */
1677 skb_dst_set(skb, dst_clone(&rt->dst));
1678 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
/* ICMPv6 gets its own per-message-type output counters. */
1679 if (proto == IPPROTO_ICMPV6) {
1680 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1682 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1683 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
/* All cork state has been transferred to the skb - release it. */
1686 ip6_cork_release(cork, v6_cork);
/*
 * ip6_send_skb - hand a finished skb to the IPv6 output path.
 *
 * Sends via ip6_local_out(); congestion-notification return codes are
 * normalised with net_xmit_errno(), and failed transmissions are
 * counted as output discards against the route's idev.
 *
 * NOTE(review): the 'err' declaration, the surrounding "if (err)"
 * checks and the return statement are elided in this excerpt.
 */
1691 int ip6_send_skb(struct sk_buff *skb)
1693 struct net *net = sock_net(skb->sk);
1694 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1697 err = ip6_local_out(net, skb->sk, skb);
/* Map NET_XMIT_* congestion codes to 0/-errno for the caller. */
1700 err = net_xmit_errno(err);
1702 IP6_INC_STATS(net, rt->rt6i_idev,
1703 IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_push_pending_frames - finalise and transmit the socket's corked
 * data: convert the write queue into one skb via ip6_finish_skb(),
 * then send it with ip6_send_skb().
 *
 * NOTE(review): the NULL check on the skb returned by
 * ip6_finish_skb() is elided in this excerpt.
 */
1709 int ip6_push_pending_frames(struct sock *sk)
1711 struct sk_buff *skb;
1713 skb = ip6_finish_skb(sk);
1717 return ip6_send_skb(skb);
1719 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/*
 * __ip6_flush_pending_frames - discard every skb queued for a corked
 * message and release the cork state.
 *
 * Each discarded skb with a dst attached is counted as an output
 * discard against its idev.
 *
 * NOTE(review): the "if (skb_dst(skb))" guard and the kfree_skb()
 * call inside the loop are elided in this excerpt.
 */
1721 static void __ip6_flush_pending_frames(struct sock *sk,
1722 struct sk_buff_head *queue,
1723 struct inet_cork_full *cork,
1724 struct inet6_cork *v6_cork)
1726 struct sk_buff *skb;
/* Drain from the tail until the queue is empty. */
1728 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1730 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1731 IPSTATS_MIB_OUTDISCARDS);
1735 ip6_cork_release(cork, v6_cork);
/*
 * ip6_flush_pending_frames - public wrapper that flushes the socket's
 * own write queue and persistent cork state.
 */
1738 void ip6_flush_pending_frames(struct sock *sk)
1740 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1741 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1743 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
/*
 * ip6_make_skb - build a complete IPv6 skb from user data in one shot,
 * using a caller-provided cork instead of the socket's persistent
 * cork state.
 *
 * Initialises the caller's cork, sets it up via ip6_setup_cork(),
 * appends the data to a private queue with __ip6_append_data() (using
 * the current task's page frag), and collapses the queue into one skb
 * with __ip6_make_skb().  On failure the queue is flushed and an
 * ERR_PTR is returned.
 *
 * Fix: "&current" had been corrupted into the mojibake "¤t"
 * (the HTML entity "&curren;" rendered as the currency sign), which
 * does not compile; restored to &current->task_frag.
 *
 * NOTE(review): some lines of the original function (braces, early
 * return for MSG_PROBE, "if (err)" checks) are elided in this
 * excerpt; the visible lines are kept as-is apart from the fix above.
 */
1745 struct sk_buff *ip6_make_skb(struct sock *sk,
1746 int getfrag(void *from, char *to, int offset,
1747 int len, int odd, struct sk_buff *skb),
1748 void *from, int length, int transhdrlen,
1749 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1750 struct rt6_info *rt, unsigned int flags,
1751 struct inet_cork_full *cork)
1753 struct inet6_cork v6_cork;
1754 struct sk_buff_head queue;
1755 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
/* MSG_PROBE: path MTU probe only - no skb is built. */
1758 if (flags & MSG_PROBE)
1761 __skb_queue_head_init(&queue);
/* Start from a clean cork; ip6_setup_cork() fills in the rest. */
1763 cork->base.flags = 0;
1764 cork->base.addr = 0;
1765 cork->base.opt = NULL;
1766 cork->base.dst = NULL;
1768 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1770 ip6_cork_release(cork, &v6_cork);
1771 return ERR_PTR(err);
/* Negative dontfrag means "not set" - inherit the socket default. */
1773 if (ipc6->dontfrag < 0)
1774 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1776 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1777 &current->task_frag, getfrag, from,
1778 length + exthdrlen, transhdrlen + exthdrlen,
/* On append failure, free everything queued so far. */
1781 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1782 return ERR_PTR(err);
1785 return __ip6_make_skb(sk, &queue, cork, &v6_cork);