/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

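/*
 * Overview (a reading of this file, not authoritative documentation):
 * transports hand a routed skb to ip6_xmit() or to the ip6_append_data()
 * family; dst_output() then dispatches to ip6_output(), which runs the
 * NF_INET_POST_ROUTING hook and ends in ip6_finish_output() and
 * ip6_finish_output2() for fragmentation and neighbour resolution on the
 * egress device.
 */
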
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                ret = dst_neigh_output(dst, neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

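/*
 * Fragmentation is forced here in three cases, as the code below reads:
 * the packet exceeds the dst MTU and is not GSO, the route requires
 * fragmenting every packet (dst_allfrag()), or conntrack defragmentation
 * recorded a smaller incoming fragment size in IP6CB(skb)->frag_max_size.
 */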
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

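/*
 * Note on NF_HOOK_COND() below: the NF_INET_POST_ROUTING chain is
 * traversed only when the condition is true; packets already flagged
 * IP6SKB_REROUTED skip the hook and go straight to ip6_finish_output(),
 * avoiding a second POST_ROUTING pass after a reroute.
 */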
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets; state may still be
 * modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (!skb2) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
                         * so it is safe to call in our context (socket lock not held)
                         */
                        skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                                     np->autoflowlabel, fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                 IPSTATS_MIB_OUT, skb->len);

                /* if the egress device is enslaved to an L3 master device,
                 * pass the skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume the socket lock is held;
                 * we promote our socket to non-const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require the socket lock,
         * so we promote our socket to non-const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

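/*
 * Usage sketch for ip6_xmit() (illustrative only, not code from this
 * file): a transport that holds a routed skb typically fills a flowi6
 * and calls, roughly:
 *
 *      struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP,
 *                            .daddr = sk->sk_v6_daddr,
 *                            .saddr = np->saddr };
 *      skb_dst_set(skb, dst);
 *      err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
 *
 * See the real call sites in net/ipv6/tcp_ipv6.c for the full setup.
 */
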
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For a reaction involving a unicast neighbour
                         * discovery message destined to the proxied address,
                         * pass it to the input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        return dst_output(net, sk, skb);
}

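/*
 * Forwarding MTU selection, as the helper below reads: a locked RTAX_MTU
 * route metric wins; otherwise fall back to the egress device's IPv6 MTU
 * (idev->cnf.mtu6), starting from the IPV6_MIN_MTU (1280) floor.
 */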
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
                return false;

        return true;
}

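/*
 * ip6_forward() runs, in order: sysctl/packet-type/LRO sanity checks, the
 * XFRM forward policy check, router-alert delivery, the hop-limit check
 * (ICMPv6 Time Exceeded when it would drop to zero), NDP proxying,
 * redirect generation, and the MTU check, before handing the packet to
 * the NF_INET_FORWARD hook. This summary is a reading aid, not normative.
 */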
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We DO NOT do any processing on
         *      RA packets, pushing them to user level AS IS
         *      without any WARRANTY that the application will be able
         *      to interpret them. The reason is that we
         *      cannot do anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything with it.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                        IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same:
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop count is delayed to the point after skb COW */

        hdr->hop_limit--;

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

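/*
 * ip6_fragment() below has two paths: a fast path that reuses an existing
 * frag_list whose chunks already have fragment-friendly geometry (each a
 * multiple of 8 bytes except the last, all within the MTU), and a slow
 * path that allocates fresh skbs and copies the payload out in
 * 8-byte-aligned blocks.
 */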
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        ip6_rt_put(rt);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                ip6_rt_put(rt);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;
        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0)        {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end,
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        skb->dev = skb_dst(skb)->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

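/*
 * ip6_rt_check() returns non-zero when the cached route can no longer be
 * trusted for fl_addr: the route key is not a matching /128 host entry
 * and the last-used address cache does not match either. A zero result
 * lets ip6_sk_dst_check() keep the socket's cached dst.
 */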
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the not-connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we may still
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is TCP, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. The oif should also be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here, if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr)))
                return -EAFNOSUPPORT;

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace to perform the lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

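/*
 * Usage sketch for ip6_dst_lookup() (illustrative, not a caller from this
 * file):
 *
 *      struct flowi6 fl6 = { .flowi6_proto = IPPROTO_UDP,
 *                            .daddr = *daddr,
 *                            .flowi6_oif = sk->sk_bound_dev_if };
 *      struct dst_entry *dst;
 *      int err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *      if (err)
 *              return err;
 *      ... use dst, then dst_release(dst) when done ...
 */
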
/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (!dst)
                dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int exthdrlen, int transhdrlen, int mtu,
                        unsigned int flags, const struct flowi6 *fl6)
{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP large send offload by the network
         * device, so create one single skb packet containing the complete
         * udp datagram
         */
        skb = skb_peek_tail(queue);
        if (!skb) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (!skb)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_set_network_header(skb, exthdrlen);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                __skb_queue_tail(queue, skb);
        } else if (skb_is_gso(skb)) {
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
                                                         &fl6->daddr,
                                                         &fl6->saddr);

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}

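/*
 * The dup helpers below rely on the extension-header length encoding from
 * RFC 2460: hdrlen counts 8-octet units beyond the first 8 octets, so the
 * full header size in bytes is (hdrlen + 1) * 8.
 */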
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

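/*
 * ip6_setup_cork() snapshots per-corking transmit state (options, dst,
 * hop limit, traffic class, MTU) so that __ip6_append_data() sees
 * consistent values across multiple sendmsg() calls. The option blocks
 * appear to be deep-copied because the caller's txoptions need not
 * outlive the cork.
 */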
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = opt->tot_len;
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above - miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}

static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6,
                             const struct sockcm_cookie *sockc)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        mtu = cork->fragsize;
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);
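        /* Worked example (illustrative): with mtu = 1500 and no extension
         * headers, fragheaderlen = sizeof(struct ipv6hdr) = 40, so
         * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1456 + 32 = 1488.
         * Each queued skb is thus capped so that, once the 8-byte fragment
         * header is inserted, the wire packet is 1496 bytes and the
         * fragmentable part (1448 bytes) stays a multiple of 8.
         */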
1317
1318         headersize = sizeof(struct ipv6hdr) +
1319                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1320                      (dst_allfrag(&rt->dst) ?
1321                       sizeof(struct frag_hdr) : 0) +
1322                      rt->rt6i_nfheader_len;
1323
1324         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1325             (sk->sk_protocol == IPPROTO_UDP ||
1326              sk->sk_protocol == IPPROTO_RAW)) {
1327                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1328                                 sizeof(struct ipv6hdr));
1329                 goto emsgsize;
1330         }
1331
1332         if (ip6_sk_ignore_df(sk))
1333                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1334         else
1335                 maxnonfragsize = mtu;
1336
1337         if (cork->length + length > maxnonfragsize - headersize) {
1338 emsgsize:
1339                 ipv6_local_error(sk, EMSGSIZE, fl6,
1340                                  mtu - headersize +
1341                                  sizeof(struct ipv6hdr));
1342                 return -EMSGSIZE;
1343         }
1344
1345         /* CHECKSUM_PARTIAL only with no extension headers and when
1346          * we are not going to fragment
1347          */
1348         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1349             headersize == sizeof(struct ipv6hdr) &&
1350             length <= mtu - headersize &&
1351             !(flags & MSG_MORE) &&
1352             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1353                 csummode = CHECKSUM_PARTIAL;
1354
1355         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1356                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1357                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1358                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1359                         tskey = sk->sk_tskey++;
1360         }
1361
1362         /*
1363          * Let's try using as much space as possible.
1364          * Use MTU if total length of the message fits into the MTU.
1365          * Otherwise, we need to reserve fragment header and
1366          * fragment alignment (= 8-15 octects, in total).
1367          *
1368          * Note that we may need to "move" the data from the tail of
1369          * of the buffer to the new fragment when we split
1370          * the message.
1371          *
1372          * FIXME: It may be fragmented into multiple chunks
1373          *        at once if non-fragmentable extension headers
1374          *        are too large.
1375          * --yoshfuji
1376          */
1377
1378         cork->length += length;
1379         if ((((length + fragheaderlen) > mtu) ||
1380              (skb && skb_is_gso(skb))) &&
1381             (sk->sk_protocol == IPPROTO_UDP) &&
1382             (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
1383             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1384                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1385                                           hh_len, fragheaderlen, exthdrlen,
1386                                           transhdrlen, mtu, flags, fl6);
1387                 if (err)
1388                         goto error;
1389                 return 0;
1390         }
1391
1392         if (!skb)
1393                 goto alloc_new_skb;
1394
1395         while (length > 0) {
1396                 /* Check if the remaining data fits into current packet. */
1397                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1398                 if (copy < length)
1399                         copy = maxfraglen - skb->len;
1400
1401                 if (copy <= 0) {
1402                         char *data;
1403                         unsigned int datalen;
1404                         unsigned int fraglen;
1405                         unsigned int fraggap;
1406                         unsigned int alloclen;
1407 alloc_new_skb:
1408                         /* There's no room in the current skb */
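                        /* fraggap is the tail of the previous skb that
                         * overflows maxfraglen; it is moved into the new
                         * fragment so that every fragment except the last
                         * keeps a payload length that is a multiple of
                         * 8 octets.
                         */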
1409                         if (skb)
1410                                 fraggap = skb->len - maxfraglen;
1411                         else
1412                                 fraggap = 0;
1413                         /* update mtu and maxfraglen if necessary */
1414                         if (!skb || !skb_prev)
1415                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1416                                                     fragheaderlen, skb, rt,
1417                                                     orig_mtu);
1418
1419                         skb_prev = skb;
1420
1421                         /*
1422                          * If remaining data exceeds the mtu,
1423                          * we know we need more fragment(s).
1424                          */
1425                         datalen = length + fraggap;
1426
1427                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1428                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
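                        /* With MSG_MORE on a device that cannot do
                         * scatter-gather, allocate a full MTU now so later
                         * appends can be copied into this linear skb.
                         */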
1429                         if ((flags & MSG_MORE) &&
1430                             !(rt->dst.dev->features & NETIF_F_SG))
1431                                 alloclen = mtu;
1432                         else
1433                                 alloclen = datalen + fragheaderlen;
1434
1435                         alloclen += dst_exthdrlen;
1436
1437                         if (datalen != length + fraggap) {
1438                                 /*
1439                                  * This is not the last fragment; the trailer
1440                                  * space is regarded as data space.
1441                                  */
1442                                 datalen += rt->dst.trailer_len;
1443                         }
1444
1445                         alloclen += rt->dst.trailer_len;
1446                         fraglen = datalen + fragheaderlen;
1447
1448                         /*
1449                          * We just reserve space for the fragment header.
1450                          * Note: this may be an overallocation if the message
1451                          * (without MSG_MORE) fits into the MTU.
1452                          */
1453                         alloclen += sizeof(struct frag_hdr);
1454
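                        /* Only the first skb carries the transport header.
                         * Its allocation may sleep in sock_alloc_send_skb()
                         * unless MSG_DONTWAIT is set; later skbs are charged
                         * against a soft limit of twice sk_sndbuf and fail
                         * with ENOBUFS rather than block.
                         */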
1455                         if (transhdrlen) {
1456                                 skb = sock_alloc_send_skb(sk,
1457                                                 alloclen + hh_len,
1458                                                 (flags & MSG_DONTWAIT), &err);
1459                         } else {
1460                                 skb = NULL;
1461                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1462                                     2 * sk->sk_sndbuf)
1463                                         skb = sock_wmalloc(sk,
1464                                                            alloclen + hh_len, 1,
1465                                                            sk->sk_allocation);
1466                                 if (unlikely(!skb))
1467                                         err = -ENOBUFS;
1468                         }
1469                         if (!skb)
1470                                 goto error;
1471                         /*
1472                          *      Fill in the control structures
1473                          */
1474                         skb->protocol = htons(ETH_P_IPV6);
1475                         skb->ip_summed = csummode;
1476                         skb->csum = 0;
1477                         /* reserve headroom for the fragment header and any IPsec headers */
1478                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1479                                     dst_exthdrlen);
1480
1481                         /* Only the initial fragment is time stamped */
1482                         skb_shinfo(skb)->tx_flags = tx_flags;
1483                         tx_flags = 0;
1484                         skb_shinfo(skb)->tskey = tskey;
1485                         tskey = 0;
1486
1487                         /*
1488                          *      Find where to start putting bytes
1489                          */
1490                         data = skb_put(skb, fraglen);
1491                         skb_set_network_header(skb, exthdrlen);
1492                         data += fragheaderlen;
1493                         skb->transport_header = (skb->network_header +
1494                                                  fragheaderlen);
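                        /* Migrate the fraggap bytes (and their checksum
                         * contribution) from the previous skb into this one,
                         * then trim the previous skb back to maxfraglen.
                         */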
1495                         if (fraggap) {
1496                                 skb->csum = skb_copy_and_csum_bits(
1497                                         skb_prev, maxfraglen,
1498                                         data + transhdrlen, fraggap, 0);
1499                                 skb_prev->csum = csum_sub(skb_prev->csum,
1500                                                           skb->csum);
1501                                 data += fraggap;
1502                                 pskb_trim_unique(skb_prev, maxfraglen);
1503                         }
1504                         copy = datalen - transhdrlen - fraggap;
1505
1506                         if (copy < 0) {
1507                                 err = -EINVAL;
1508                                 kfree_skb(skb);
1509                                 goto error;
1510                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1511                                 err = -EFAULT;
1512                                 kfree_skb(skb);
1513                                 goto error;
1514                         }
1515
1516                         offset += copy;
1517                         length -= datalen - fraggap;
1518                         transhdrlen = 0;
1519                         exthdrlen = 0;
1520                         dst_exthdrlen = 0;
1521
1522                         /*
1523                          * Put the packet on the pending queue
1524                          */
1525                         __skb_queue_tail(queue, skb);
1526                         continue;
1527                 }
1528
1529                 if (copy > length)
1530                         copy = length;
1531
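                /* Room remains in the current skb: without scatter-gather,
                 * copy into the linear area; otherwise append to the page
                 * fragments, coalescing with the last frag when possible.
                 */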
1532         if (!(rt->dst.dev->features & NETIF_F_SG)) {
1533                         unsigned int off;
1534
1535                         off = skb->len;
1536                         if (getfrag(from, skb_put(skb, copy),
1537                                                 offset, copy, off, skb) < 0) {
1538                                 __skb_trim(skb, off);
1539                                 err = -EFAULT;
1540                                 goto error;
1541                         }
1542                 } else {
1543                         int i = skb_shinfo(skb)->nr_frags;
1544
1545                         err = -ENOMEM;
1546                         if (!sk_page_frag_refill(sk, pfrag))
1547                                 goto error;
1548
1549                         if (!skb_can_coalesce(skb, i, pfrag->page,
1550                                               pfrag->offset)) {
1551                                 err = -EMSGSIZE;
1552                                 if (i == MAX_SKB_FRAGS)
1553                                         goto error;
1554
1555                                 __skb_fill_page_desc(skb, i, pfrag->page,
1556                                                      pfrag->offset, 0);
1557                                 skb_shinfo(skb)->nr_frags = ++i;
1558                                 get_page(pfrag->page);
1559                         }
1560                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1561                         if (getfrag(from,
1562                                     page_address(pfrag->page) + pfrag->offset,
1563                                     offset, copy, skb->len, skb) < 0)
1564                                 goto error_efault;
1565
1566                         pfrag->offset += copy;
1567                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1568                         skb->len += copy;
1569                         skb->data_len += copy;
1570                         skb->truesize += copy;
1571                         atomic_add(copy, &sk->sk_wmem_alloc);
1572                 }
1573                 offset += copy;
1574                 length -= copy;
1575         }
1576
1577         return 0;
1578
1579 error_efault:
1580         err = -EFAULT;
1581 error:
1582         cork->length -= length;
1583         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1584         return err;
1585 }
1586
1587 int ip6_append_data(struct sock *sk,
1588                     int getfrag(void *from, char *to, int offset, int len,
1589                                 int odd, struct sk_buff *skb),
1590                     void *from, int length, int transhdrlen,
1591                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1592                     struct rt6_info *rt, unsigned int flags,
1593                     const struct sockcm_cookie *sockc)
1594 {
1595         struct inet_sock *inet = inet_sk(sk);
1596         struct ipv6_pinfo *np = inet6_sk(sk);
1597         int exthdrlen;
1598         int err;
1599
1600         if (flags & MSG_PROBE)
1601                 return 0;
1602         if (skb_queue_empty(&sk->sk_write_queue)) {
1603                 /*
1604                  * setup for corking
1605                  */
1606                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1607                                      ipc6, rt, fl6);
1608                 if (err)
1609                         return err;
1610
1611                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1612                 length += exthdrlen;
1613                 transhdrlen += exthdrlen;
1614         } else {
1615                 fl6 = &inet->cork.fl.u.ip6;
1616                 transhdrlen = 0;
1617         }
1618
1619         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1620                                  &np->cork, sk_page_frag(sk), getfrag,
1621                                  from, length, transhdrlen, flags, ipc6, sockc);
1622 }
1623 EXPORT_SYMBOL_GPL(ip6_append_data);
1624
1625 static void ip6_cork_release(struct inet_cork_full *cork,
1626                              struct inet6_cork *v6_cork)
1627 {
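        /* The cork holds its own copy of the tx options, so every
         * extension-header buffer is freed individually.
         */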
1628         if (v6_cork->opt) {
1629                 kfree(v6_cork->opt->dst0opt);
1630                 kfree(v6_cork->opt->dst1opt);
1631                 kfree(v6_cork->opt->hopopt);
1632                 kfree(v6_cork->opt->srcrt);
1633                 kfree(v6_cork->opt);
1634                 v6_cork->opt = NULL;
1635         }
1636
1637         if (cork->base.dst) {
1638                 dst_release(cork->base.dst);
1639                 cork->base.dst = NULL;
1640                 cork->base.flags &= ~IPCORK_ALLFRAG;
1641         }
1642         memset(&cork->fl, 0, sizeof(cork->fl));
1643 }
1644
1645 struct sk_buff *__ip6_make_skb(struct sock *sk,
1646                                struct sk_buff_head *queue,
1647                                struct inet_cork_full *cork,
1648                                struct inet6_cork *v6_cork)
1649 {
1650         struct sk_buff *skb, *tmp_skb;
1651         struct sk_buff **tail_skb;
1652         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1653         struct ipv6_pinfo *np = inet6_sk(sk);
1654         struct net *net = sock_net(sk);
1655         struct ipv6hdr *hdr;
1656         struct ipv6_txoptions *opt = v6_cork->opt;
1657         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1658         struct flowi6 *fl6 = &cork->fl.u.ip6;
1659         unsigned char proto = fl6->flowi6_proto;
1660
1661         skb = __skb_dequeue(queue);
1662         if (!skb)
1663                 goto out;
1664         tail_skb = &(skb_shinfo(skb)->frag_list);
1665
1666         /* move skb->data up to the IP header, past any ext header room */
1667         if (skb->data < skb_network_header(skb))
1668                 __skb_pull(skb, skb_network_offset(skb));
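        /* Chain the remaining queued skbs onto the head skb's frag_list;
         * if the packet needs fragmenting, ip6_fragment() can later walk
         * this list on its fast path.
         */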
1669         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1670                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1671                 *tail_skb = tmp_skb;
1672                 tail_skb = &(tmp_skb->next);
1673                 skb->len += tmp_skb->len;
1674                 skb->data_len += tmp_skb->len;
1675                 skb->truesize += tmp_skb->truesize;
1676                 tmp_skb->destructor = NULL;
1677                 tmp_skb->sk = NULL;
1678         }
1679
1680         /* Allow local fragmentation. */
1681         skb->ignore_df = ip6_sk_ignore_df(sk);
1682
1683         *final_dst = fl6->daddr;
1684         __skb_pull(skb, skb_network_header_len(skb));
1685         if (opt && opt->opt_flen)
1686                 ipv6_push_frag_opts(skb, opt, &proto);
1687         if (opt && opt->opt_nflen)
1688                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1689
1690         skb_push(skb, sizeof(struct ipv6hdr));
1691         skb_reset_network_header(skb);
1692         hdr = ipv6_hdr(skb);
1693
1694         ip6_flow_hdr(hdr, v6_cork->tclass,
1695                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1696                                         np->autoflowlabel, fl6));
1697         hdr->hop_limit = v6_cork->hop_limit;
1698         hdr->nexthdr = proto;
1699         hdr->saddr = fl6->saddr;
1700         hdr->daddr = *final_dst;
1701
1702         skb->priority = sk->sk_priority;
1703         skb->mark = sk->sk_mark;
1704
1705         skb_dst_set(skb, dst_clone(&rt->dst));
1706         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1707         if (proto == IPPROTO_ICMPV6) {
1708                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1709
1710                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1711                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1712         }
1713
1714         ip6_cork_release(cork, v6_cork);
1715 out:
1716         return skb;
1717 }
1718
1719 int ip6_send_skb(struct sk_buff *skb)
1720 {
1721         struct net *net = sock_net(skb->sk);
1722         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1723         int err;
1724
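        /* ip6_local_out() may return a positive NET_XMIT_* code from the
         * qdisc; net_xmit_errno() maps that to 0 or -ENOBUFS, and only a
         * real error is counted as an output discard.
         */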
1725         err = ip6_local_out(net, skb->sk, skb);
1726         if (err) {
1727                 if (err > 0)
1728                         err = net_xmit_errno(err);
1729                 if (err)
1730                         IP6_INC_STATS(net, rt->rt6i_idev,
1731                                       IPSTATS_MIB_OUTDISCARDS);
1732         }
1733
1734         return err;
1735 }
1736
1737 int ip6_push_pending_frames(struct sock *sk)
1738 {
1739         struct sk_buff *skb;
1740
1741         skb = ip6_finish_skb(sk);
1742         if (!skb)
1743                 return 0;
1744
1745         return ip6_send_skb(skb);
1746 }
1747 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1748
1749 static void __ip6_flush_pending_frames(struct sock *sk,
1750                                        struct sk_buff_head *queue,
1751                                        struct inet_cork_full *cork,
1752                                        struct inet6_cork *v6_cork)
1753 {
1754         struct sk_buff *skb;
1755
1756         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1757                 if (skb_dst(skb))
1758                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1759                                       IPSTATS_MIB_OUTDISCARDS);
1760                 kfree_skb(skb);
1761         }
1762
1763         ip6_cork_release(cork, v6_cork);
1764 }
1765
1766 void ip6_flush_pending_frames(struct sock *sk)
1767 {
1768         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1769                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1770 }
1771 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1772
1773 struct sk_buff *ip6_make_skb(struct sock *sk,
1774                              int getfrag(void *from, char *to, int offset,
1775                                          int len, int odd, struct sk_buff *skb),
1776                              void *from, int length, int transhdrlen,
1777                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1778                              struct rt6_info *rt, unsigned int flags,
1779                              const struct sockcm_cookie *sockc)
1780 {
1781         struct inet_cork_full cork;
1782         struct inet6_cork v6_cork;
1783         struct sk_buff_head queue;
1784         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1785         int err;
1786
1787         if (flags & MSG_PROBE)
1788                 return NULL;
1789
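        /* Unlike ip6_append_data(), build the datagram on a private queue
         * with cork state on the stack, leaving the socket's write queue
         * and pending-frames state untouched.
         */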
1790         __skb_queue_head_init(&queue);
1791
1792         cork.base.flags = 0;
1793         cork.base.addr = 0;
1794         cork.base.opt = NULL;
1795         v6_cork.opt = NULL;
1796         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1797         if (err)
1798                 return ERR_PTR(err);
1799
1800         if (ipc6->dontfrag < 0)
1801                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1802
1803         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1804                                 &current->task_frag, getfrag, from,
1805                                 length + exthdrlen, transhdrlen + exthdrlen,
1806                                 flags, ipc6, sockc);
1807         if (err) {
1808                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1809                 return ERR_PTR(err);
1810         }
1811
1812         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1813 }