1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64         struct dst_entry *dst = skb_dst(skb);
65         struct net_device *dev = dst->dev;
66         struct neighbour *neigh;
67         struct in6_addr *nexthop;
68         int ret;
69
70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74                     ((mroute6_is_socket(net, skb) &&
75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77                                          &ipv6_hdr(skb)->saddr))) {
78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80                         /* Do not check for IFF_ALLMULTI; multicast routing
81                            is not supported in any case.
82                          */
83                         if (newskb)
84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85                                         net, sk, newskb, NULL, newskb->dev,
86                                         dev_loopback_xmit);
87
88                         if (ipv6_hdr(skb)->hop_limit == 0) {
89                                 IP6_INC_STATS(net, idev,
90                                               IPSTATS_MIB_OUTDISCARDS);
91                                 kfree_skb(skb);
92                                 return 0;
93                         }
94                 }
95
96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99                     IPV6_ADDR_SCOPE_NODELOCAL &&
100                     !(dev->flags & IFF_LOOPBACK)) {
101                         kfree_skb(skb);
102                         return 0;
103                 }
104         }
105
106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107                 int res = lwtunnel_xmit(skb);
108
109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110                         return res;
111         }
112
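        /* Resolve the L2 neighbour for the next hop under the RCU-BH
         * read lock; if no entry is cached, create one so the packet
         * can be queued while neighbour discovery completes.
         */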
113         rcu_read_lock_bh();
114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116         if (unlikely(!neigh))
117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118         if (!IS_ERR(neigh)) {
119                 sock_confirm_neigh(skb, neigh);
120                 ret = neigh_output(neigh, skb);
121                 rcu_read_unlock_bh();
122                 return ret;
123         }
124         rcu_read_unlock_bh();
125
126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127         kfree_skb(skb);
128         return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133         int ret;
134
135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136         if (ret) {
137                 kfree_skb(skb);
138                 return ret;
139         }
140
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142         /* Policy lookup after SNAT yielded a new policy */
143         if (skb_dst(skb)->xfrm) {
144                 IPCB(skb)->flags |= IPSKB_REROUTED;
145                 return dst_output(net, sk, skb);
146         }
147 #endif
148
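        /* Fragment when the packet exceeds the path MTU and is not GSO,
         * when the route requires fragmenting all packets (allfrag), or
         * when conntrack defrag recorded incoming fragments smaller than
         * the packet we are about to send.
         */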
149         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150             dst_allfrag(skb_dst(skb)) ||
151             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153         else
154                 return ip6_finish_output2(net, sk, skb);
155 }
156
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162         skb->protocol = htons(ETH_P_IPV6);
163         skb->dev = dev;
164
165         if (unlikely(idev->cnf.disable_ipv6)) {
166                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167                 kfree_skb(skb);
168                 return 0;
169         }
170
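        /* Traverse the POST_ROUTING hook unless this skb was already
         * rerouted (IP6SKB_REROUTED); either way the packet ends up in
         * ip6_finish_output().
         */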
171         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172                             net, sk, skb, NULL, dev,
173                             ip6_finish_output,
174                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179         if (!np->autoflowlabel_set)
180                 return ip6_default_np_autolabel(net);
181         else
182                 return np->autoflowlabel;
183 }
184
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note: the socket lock is not held for SYNACK packets, but the socket
188  * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which use proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192              __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194         struct net *net = sock_net(sk);
195         const struct ipv6_pinfo *np = inet6_sk(sk);
196         struct in6_addr *first_hop = &fl6->daddr;
197         struct dst_entry *dst = skb_dst(skb);
198         struct ipv6hdr *hdr;
199         u8  proto = fl6->flowi6_proto;
200         int seg_len = skb->len;
201         int hlimit = -1;
202         u32 mtu;
203
204         if (opt) {
205                 unsigned int head_room;
206
207                 /* First: exthdrs may take lots of space (~8K for now);
208                    MAX_HEADER is not enough.
209                  */
210                 head_room = opt->opt_nflen + opt->opt_flen;
211                 seg_len += head_room;
212                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213
214                 if (skb_headroom(skb) < head_room) {
215                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216                         if (!skb2) {
217                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218                                               IPSTATS_MIB_OUTDISCARDS);
219                                 kfree_skb(skb);
220                                 return -ENOBUFS;
221                         }
222                         consume_skb(skb);
223                         skb = skb2;
224                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225                          * so it is safe to call in our context (socket lock not held)
226                          */
227                         skb_set_owner_w(skb, (struct sock *)sk);
228                 }
229                 if (opt->opt_flen)
230                         ipv6_push_frag_opts(skb, opt, &proto);
231                 if (opt->opt_nflen)
232                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233                                              &fl6->saddr);
234         }
235
236         skb_push(skb, sizeof(struct ipv6hdr));
237         skb_reset_network_header(skb);
238         hdr = ipv6_hdr(skb);
239
240         /*
241          *      Fill in the IPv6 header
242          */
243         if (np)
244                 hlimit = np->hop_limit;
245         if (hlimit < 0)
246                 hlimit = ip6_dst_hoplimit(dst);
247
248         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249                                 ip6_autoflowlabel(net, np), fl6));
250
251         hdr->payload_len = htons(seg_len);
252         hdr->nexthdr = proto;
253         hdr->hop_limit = hlimit;
254
255         hdr->saddr = fl6->saddr;
256         hdr->daddr = *first_hop;
257
258         skb->protocol = htons(ETH_P_IPV6);
259         skb->priority = sk->sk_priority;
260         skb->mark = mark;
261
262         mtu = dst_mtu(dst);
263         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265                               IPSTATS_MIB_OUT, skb->len);
266
267                 /* if egress device is enslaved to an L3 master device pass the
268                  * skb to its handler for processing
269                  */
270                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
271                 if (unlikely(!skb))
272                         return 0;
273
274                 /* Hooks should never assume the socket lock is held;
275                  * we promote our socket to non-const.
276                  */
277                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278                                net, (struct sock *)sk, skb, NULL, dst->dev,
279                                dst_output);
280         }
281
282         skb->dev = dst->dev;
283         /* ipv6_local_error() does not require the socket lock,
284          * so we promote our socket to non-const.
285          */
286         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287
288         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289         kfree_skb(skb);
290         return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
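
/*
 * Illustrative only (not a caller taken from this tree): a stream
 * transport that has already attached a routed dst to the skb might
 * invoke ip6_xmit() roughly as below, with sk, np and skb prepared by
 * the caller beforehand:
 *
 *	struct flowi6 fl6;
 *
 *	memset(&fl6, 0, sizeof(fl6));
 *	fl6.flowi6_proto = IPPROTO_TCP;
 *	fl6.daddr = sk->sk_v6_daddr;
 *	fl6.saddr = np->saddr;
 *	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark,
 *		       rcu_dereference(np->opt), np->tclass);
 */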
293
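/* Deliver a Router Alert packet to every raw socket that registered
 * interest in this alert value via the IPV6_ROUTER_ALERT option; each
 * listener but the last receives a clone, the last consumes the skb.
 */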
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296         struct ip6_ra_chain *ra;
297         struct sock *last = NULL;
298
299         read_lock(&ip6_ra_lock);
300         for (ra = ip6_ra_chain; ra; ra = ra->next) {
301                 struct sock *sk = ra->sk;
302                 if (sk && ra->sel == sel &&
303                     (!sk->sk_bound_dev_if ||
304                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
305                         if (last) {
306                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307                                 if (skb2)
308                                         rawv6_rcv(last, skb2);
309                         }
310                         last = sk;
311                 }
312         }
313
314         if (last) {
315                 rawv6_rcv(last, skb);
316                 read_unlock(&ip6_ra_lock);
317                 return 1;
318         }
319         read_unlock(&ip6_ra_lock);
320         return 0;
321 }
322
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325         struct ipv6hdr *hdr = ipv6_hdr(skb);
326         u8 nexthdr = hdr->nexthdr;
327         __be16 frag_off;
328         int offset;
329
330         if (ipv6_ext_hdr(nexthdr)) {
331                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332                 if (offset < 0)
333                         return 0;
334         } else
335                 offset = sizeof(struct ipv6hdr);
336
337         if (nexthdr == IPPROTO_ICMPV6) {
338                 struct icmp6hdr *icmp6;
339
340                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
341                                          offset + 1 - skb->data)))
342                         return 0;
343
344                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345
346                 switch (icmp6->icmp6_type) {
347                 case NDISC_ROUTER_SOLICITATION:
348                 case NDISC_ROUTER_ADVERTISEMENT:
349                 case NDISC_NEIGHBOUR_SOLICITATION:
350                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
351                 case NDISC_REDIRECT:
352                         /* For a unicast neighbor discovery message
353                          * destined to the proxied address, pass it to
354                          * the input function.
355                          */
356                         return 1;
357                 default:
358                         break;
359                 }
360         }
361
362         /*
363          * The proxying router can't forward traffic sent to a link-local
364          * address, so signal the sender and discard the packet. This
365          * behavior is clarified by the MIPv6 specification.
366          */
367         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368                 dst_link_failure(skb);
369                 return -1;
370         }
371
372         return 0;
373 }
374
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376                                      struct sk_buff *skb)
377 {
378         struct dst_entry *dst = skb_dst(skb);
379
380         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
381         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
382
383         return dst_output(net, sk, skb);
384 }
385
386 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
387 {
388         if (skb->len <= mtu)
389                 return false;
390
391         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
392         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
393                 return true;
394
395         if (skb->ignore_df)
396                 return false;
397
398         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
399                 return false;
400
401         return true;
402 }
403
404 int ip6_forward(struct sk_buff *skb)
405 {
406         struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
407         struct dst_entry *dst = skb_dst(skb);
408         struct ipv6hdr *hdr = ipv6_hdr(skb);
409         struct inet6_skb_parm *opt = IP6CB(skb);
410         struct net *net = dev_net(dst->dev);
411         u32 mtu;
412
413         if (net->ipv6.devconf_all->forwarding == 0)
414                 goto error;
415
416         if (skb->pkt_type != PACKET_HOST)
417                 goto drop;
418
419         if (unlikely(skb->sk))
420                 goto drop;
421
422         if (skb_warn_if_lro(skb))
423                 goto drop;
424
425         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
426                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
427                 goto drop;
428         }
429
430         skb_forward_csum(skb);
431
432         /*
433          *      We do NOT do any processing on
434          *      RA packets; we push them to user level AS IS,
435          *      without any warranty that the application will be
436          *      able to interpret them. The reason is that we
437          *      cannot do anything clever here.
438          *
439          *      We are not an end node, so if the packet contains
440          *      AH/ESP we cannot do anything.
441          *      Defragmentation would also be a mistake; RA packets
442          *      cannot be fragmented, because there is no guarantee
443          *      that different fragments will go along one path. --ANK
444          */
445         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
446                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
447                         return 0;
448         }
449
450         /*
451          *      check and decrement the hop limit
452          */
453         if (hdr->hop_limit <= 1) {
454                 /* Force the output device to be used for the error's source address */
455                 skb->dev = dst->dev;
456                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
457                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
458
459                 kfree_skb(skb);
460                 return -ETIMEDOUT;
461         }
462
463         /* XXX: idev->cnf.proxy_ndp? */
464         if (net->ipv6.devconf_all->proxy_ndp &&
465             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
466                 int proxied = ip6_forward_proxy_check(skb);
467                 if (proxied > 0)
468                         return ip6_input(skb);
469                 else if (proxied < 0) {
470                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
471                         goto drop;
472                 }
473         }
474
475         if (!xfrm6_route_forward(skb)) {
476                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
477                 goto drop;
478         }
479         dst = skb_dst(skb);
480
481         /* The IPv6 specs say nothing about it, but it is clear that we cannot
482            send redirects to source-routed frames.
483            We also don't send redirects to frames decapsulated from IPsec.
484          */
485         if (IP6CB(skb)->iif == dst->dev->ifindex &&
486             opt->srcrt == 0 && !skb_sec_path(skb)) {
487                 struct in6_addr *target = NULL;
488                 struct inet_peer *peer;
489                 struct rt6_info *rt;
490
491                 /*
492                  *      incoming and outgoing devices are the same;
493                  *      send a redirect.
494                  */
495
496                 rt = (struct rt6_info *) dst;
497                 if (rt->rt6i_flags & RTF_GATEWAY)
498                         target = &rt->rt6i_gateway;
499                 else
500                         target = &hdr->daddr;
501
502                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
503
504                 /* Limit redirects both by destination (here)
505                    and by source (inside ndisc_send_redirect)
506                  */
507                 if (inet_peer_xrlim_allow(peer, 1*HZ))
508                         ndisc_send_redirect(skb, target);
509                 if (peer)
510                         inet_putpeer(peer);
511         } else {
512                 int addrtype = ipv6_addr_type(&hdr->saddr);
513
514                 /* This check is security critical. */
515                 if (addrtype == IPV6_ADDR_ANY ||
516                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
517                         goto error;
518                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
519                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
520                                     ICMPV6_NOT_NEIGHBOUR, 0);
521                         goto error;
522                 }
523         }
524
525         mtu = ip6_dst_mtu_forward(dst);
526         if (mtu < IPV6_MIN_MTU)
527                 mtu = IPV6_MIN_MTU;
528
529         if (ip6_pkt_too_big(skb, mtu)) {
530                 /* Again, force the output device to be used for the error's source address */
531                 skb->dev = dst->dev;
532                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
533                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
534                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
535                                 IPSTATS_MIB_FRAGFAILS);
536                 kfree_skb(skb);
537                 return -EMSGSIZE;
538         }
539
540         if (skb_cow(skb, dst->dev->hard_header_len)) {
541                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
542                                 IPSTATS_MIB_OUTDISCARDS);
543                 goto drop;
544         }
545
546         hdr = ipv6_hdr(skb);
547
548         /* Decrementing the hop limit is delayed until after the skb COW */
549
550         hdr->hop_limit--;
551
552         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
553                        net, NULL, skb, skb->dev, dst->dev,
554                        ip6_forward_finish);
555
556 error:
557         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
558 drop:
559         kfree_skb(skb);
560         return -EINVAL;
561 }
562
563 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
564 {
565         to->pkt_type = from->pkt_type;
566         to->priority = from->priority;
567         to->protocol = from->protocol;
568         skb_dst_drop(to);
569         skb_dst_set(to, dst_clone(skb_dst(from)));
570         to->dev = from->dev;
571         to->mark = from->mark;
572
573         skb_copy_hash(to, from);
574
575 #ifdef CONFIG_NET_SCHED
576         to->tc_index = from->tc_index;
577 #endif
578         nf_copy(to, from);
579         skb_copy_secmark(to, from);
580 }
581
582 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
583                  int (*output)(struct net *, struct sock *, struct sk_buff *))
584 {
585         struct sk_buff *frag;
586         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
587         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
588                                 inet6_sk(skb->sk) : NULL;
589         struct ipv6hdr *tmp_hdr;
590         struct frag_hdr *fh;
591         unsigned int mtu, hlen, left, len;
592         int hroom, troom;
593         __be32 frag_id;
594         int ptr, offset = 0, err = 0;
595         u8 *prevhdr, nexthdr = 0;
596
597         err = ip6_find_1stfragopt(skb, &prevhdr);
598         if (err < 0)
599                 goto fail;
600         hlen = err;
601         nexthdr = *prevhdr;
602
603         mtu = ip6_skb_dst_mtu(skb);
604
605         /* We must not fragment if the socket is set to force MTU discovery
606          * or if the skb was not generated by a local socket.
607          */
608         if (unlikely(!skb->ignore_df && skb->len > mtu))
609                 goto fail_toobig;
610
611         if (IP6CB(skb)->frag_max_size) {
612                 if (IP6CB(skb)->frag_max_size > mtu)
613                         goto fail_toobig;
614
615                 /* don't send fragments larger than what we received */
616                 mtu = IP6CB(skb)->frag_max_size;
617                 if (mtu < IPV6_MIN_MTU)
618                         mtu = IPV6_MIN_MTU;
619         }
620
621         if (np && np->frag_size < mtu) {
622                 if (np->frag_size)
623                         mtu = np->frag_size;
624         }
625         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
626                 goto fail_toobig;
627         mtu -= hlen + sizeof(struct frag_hdr);
628
629         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
630                                     &ipv6_hdr(skb)->saddr);
631
632         if (skb->ip_summed == CHECKSUM_PARTIAL &&
633             (err = skb_checksum_help(skb)))
634                 goto fail;
635
636         hroom = LL_RESERVED_SPACE(rt->dst.dev);
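        /* Fast path: if the skb already carries a frag list whose
         * members have suitable geometry, convert each list member into
         * a fragment in place instead of copying the payload.
         */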
637         if (skb_has_frag_list(skb)) {
638                 unsigned int first_len = skb_pagelen(skb);
639                 struct sk_buff *frag2;
640
641                 if (first_len - hlen > mtu ||
642                     ((first_len - hlen) & 7) ||
643                     skb_cloned(skb) ||
644                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
645                         goto slow_path;
646
647                 skb_walk_frags(skb, frag) {
648                         /* Correct geometry. */
649                         if (frag->len > mtu ||
650                             ((frag->len & 7) && frag->next) ||
651                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
652                                 goto slow_path_clean;
653
654                         /* Partially cloned skb? */
655                         if (skb_shared(frag))
656                                 goto slow_path_clean;
657
658                         BUG_ON(frag->sk);
659                         if (skb->sk) {
660                                 frag->sk = skb->sk;
661                                 frag->destructor = sock_wfree;
662                         }
663                         skb->truesize -= frag->truesize;
664                 }
665
666                 err = 0;
667                 offset = 0;
668                 /* BUILD HEADER */
669
670                 *prevhdr = NEXTHDR_FRAGMENT;
671                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
672                 if (!tmp_hdr) {
673                         err = -ENOMEM;
674                         goto fail;
675                 }
676                 frag = skb_shinfo(skb)->frag_list;
677                 skb_frag_list_init(skb);
678
679                 __skb_pull(skb, hlen);
680                 fh = __skb_push(skb, sizeof(struct frag_hdr));
681                 __skb_push(skb, hlen);
682                 skb_reset_network_header(skb);
683                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
684
685                 fh->nexthdr = nexthdr;
686                 fh->reserved = 0;
687                 fh->frag_off = htons(IP6_MF);
688                 fh->identification = frag_id;
689
690                 first_len = skb_pagelen(skb);
691                 skb->data_len = first_len - skb_headlen(skb);
692                 skb->len = first_len;
693                 ipv6_hdr(skb)->payload_len = htons(first_len -
694                                                    sizeof(struct ipv6hdr));
695
696                 for (;;) {
697                         /* Prepare the header of the next fragment
698                          * before the previous one is sent. */
699                         if (frag) {
700                                 frag->ip_summed = CHECKSUM_NONE;
701                                 skb_reset_transport_header(frag);
702                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
703                                 __skb_push(frag, hlen);
704                                 skb_reset_network_header(frag);
705                                 memcpy(skb_network_header(frag), tmp_hdr,
706                                        hlen);
707                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
708                                 fh->nexthdr = nexthdr;
709                                 fh->reserved = 0;
710                                 fh->frag_off = htons(offset);
711                                 if (frag->next)
712                                         fh->frag_off |= htons(IP6_MF);
713                                 fh->identification = frag_id;
714                                 ipv6_hdr(frag)->payload_len =
715                                                 htons(frag->len -
716                                                       sizeof(struct ipv6hdr));
717                                 ip6_copy_metadata(frag, skb);
718                         }
719
720                         err = output(net, sk, skb);
721                         if (!err)
722                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
723                                               IPSTATS_MIB_FRAGCREATES);
724
725                         if (err || !frag)
726                                 break;
727
728                         skb = frag;
729                         frag = skb->next;
730                         skb->next = NULL;
731                 }
732
733                 kfree(tmp_hdr);
734
735                 if (err == 0) {
736                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
737                                       IPSTATS_MIB_FRAGOKS);
738                         return 0;
739                 }
740
741                 kfree_skb_list(frag);
742
743                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
744                               IPSTATS_MIB_FRAGFAILS);
745                 return err;
746
747 slow_path_clean:
748                 skb_walk_frags(skb, frag2) {
749                         if (frag2 == frag)
750                                 break;
751                         frag2->sk = NULL;
752                         frag2->destructor = NULL;
753                         skb->truesize += frag2->truesize;
754                 }
755         }
756
757 slow_path:
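        /* Slow path: allocate a fresh skb for every fragment and copy
         * the payload into it; every fragment except the last must
         * carry a payload length that is a multiple of 8 octets.
         */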
758         left = skb->len - hlen;         /* Space per frame */
759         ptr = hlen;                     /* Where to start from */
760
761         /*
762          *      Fragment the datagram.
763          */
764
765         troom = rt->dst.dev->needed_tailroom;
766
767         /*
768          *      Keep copying data until we run out.
769          */
770         while (left > 0)        {
771                 u8 *fragnexthdr_offset;
772
773                 len = left;
774                 /* IF: it doesn't fit, use 'mtu' - the data space left */
775                 if (len > mtu)
776                         len = mtu;
777                 /* IF: we are not sending up to and including the packet end,
778                    then align the next start on an eight-byte boundary */
779                 if (len < left) {
780                         len &= ~7;
781                 }
782
783                 /* Allocate buffer */
784                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
785                                  hroom + troom, GFP_ATOMIC);
786                 if (!frag) {
787                         err = -ENOMEM;
788                         goto fail;
789                 }
790
791                 /*
792                  *      Set up data on packet
793                  */
794
795                 ip6_copy_metadata(frag, skb);
796                 skb_reserve(frag, hroom);
797                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
798                 skb_reset_network_header(frag);
799                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
800                 frag->transport_header = (frag->network_header + hlen +
801                                           sizeof(struct frag_hdr));
802
803                 /*
804                  *      Charge the memory for the fragment to any owner
805                  *      it might possess
806                  */
807                 if (skb->sk)
808                         skb_set_owner_w(frag, skb->sk);
809
810                 /*
811                  *      Copy the packet header into the new buffer.
812                  */
813                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
814
815                 fragnexthdr_offset = skb_network_header(frag);
816                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
817                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
818
819                 /*
820                  *      Build fragment header.
821                  */
822                 fh->nexthdr = nexthdr;
823                 fh->reserved = 0;
824                 fh->identification = frag_id;
825
826                 /*
827                  *      Copy a block of the IP datagram.
828                  */
829                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
830                                      len));
831                 left -= len;
832
833                 fh->frag_off = htons(offset);
834                 if (left > 0)
835                         fh->frag_off |= htons(IP6_MF);
836                 ipv6_hdr(frag)->payload_len = htons(frag->len -
837                                                     sizeof(struct ipv6hdr));
838
839                 ptr += len;
840                 offset += len;
841
842                 /*
843                  *      Put this fragment into the sending queue.
844                  */
845                 err = output(net, sk, frag);
846                 if (err)
847                         goto fail;
848
849                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
850                               IPSTATS_MIB_FRAGCREATES);
851         }
852         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
853                       IPSTATS_MIB_FRAGOKS);
854         consume_skb(skb);
855         return err;
856
857 fail_toobig:
858         if (skb->sk && dst_allfrag(skb_dst(skb)))
859                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
860
861         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
862         err = -EMSGSIZE;
863
864 fail:
865         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
866                       IPSTATS_MIB_FRAGFAILS);
867         kfree_skb(skb);
868         return err;
869 }
870
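/* Returns nonzero when the cached route can no longer be trusted for
 * this flow: the key is not a host route matching fl_addr, and the
 * last validated destination (addr_cache) does not match it either.
 */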
871 static inline int ip6_rt_check(const struct rt6key *rt_key,
872                                const struct in6_addr *fl_addr,
873                                const struct in6_addr *addr_cache)
874 {
875         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
876                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
877 }
878
879 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
880                                           struct dst_entry *dst,
881                                           const struct flowi6 *fl6)
882 {
883         struct ipv6_pinfo *np = inet6_sk(sk);
884         struct rt6_info *rt;
885
886         if (!dst)
887                 goto out;
888
889         if (dst->ops->family != AF_INET6) {
890                 dst_release(dst);
891                 return NULL;
892         }
893
894         rt = (struct rt6_info *)dst;
895         /* Yes, checking route validity in the not-connected
896          * case is not very simple. Take into account
897          * that we do not support routing by source, TOS,
898          * or MSG_DONTROUTE            --ANK (980726)
899          *
900          * 1. ip6_rt_check(): If the route was a host route,
901          *    check that the cached destination is current.
902          *    If it is a network route, we still may
903          *    check its validity using a saved pointer
904          *    to the last used address: daddr_cache.
905          *    We do not want to save the whole address now
906          *    (because the main consumer of this service
907          *    is TCP, which does not have this problem),
908          *    so this last trick works only on connected
909          *    sockets.
910          * 2. oif also should be the same.
911          */
912         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
913 #ifdef CONFIG_IPV6_SUBTREES
914             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
915 #endif
916            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
917               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
918                 dst_release(dst);
919                 dst = NULL;
920         }
921
922 out:
923         return dst;
924 }
925
926 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
927                                struct dst_entry **dst, struct flowi6 *fl6)
928 {
929 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
930         struct neighbour *n;
931         struct rt6_info *rt;
932 #endif
933         int err;
934         int flags = 0;
935
936         /* The correct way to handle this would be to do
937          * ip6_route_get_saddr, and then ip6_route_output; however,
938          * the route-specific preferred source forces the
939          * ip6_route_output call _before_ ip6_route_get_saddr.
940          *
941          * In source specific routing (no src=any default route),
942          * ip6_route_output will fail given src=any saddr, though, so
943          * that's why we try it again later.
944          */
945         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
946                 struct fib6_info *from;
947                 struct rt6_info *rt;
948                 bool had_dst = *dst != NULL;
949
950                 if (!had_dst)
951                         *dst = ip6_route_output(net, sk, fl6);
952                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
953
954                 rcu_read_lock();
955                 from = rt ? rcu_dereference(rt->from) : NULL;
956                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
957                                           sk ? inet6_sk(sk)->srcprefs : 0,
958                                           &fl6->saddr);
959                 rcu_read_unlock();
960
961                 if (err)
962                         goto out_err_release;
963
964                 /* If we had an erroneous initial result, pretend it
965                  * never existed and let the SA-enabled version take
966                  * over.
967                  */
968                 if (!had_dst && (*dst)->error) {
969                         dst_release(*dst);
970                         *dst = NULL;
971                 }
972
973                 if (fl6->flowi6_oif)
974                         flags |= RT6_LOOKUP_F_IFACE;
975         }
976
977         if (!*dst)
978                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
979
980         err = (*dst)->error;
981         if (err)
982                 goto out_err_release;
983
984 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
985         /*
986          * If the dst entry we've looked up
987          * has a neighbour entry that is in the INCOMPLETE
988          * state and the src address from the flow is
989          * marked as OPTIMISTIC, we release the found
990          * dst entry and replace it with the dst entry
991          * of the nexthop router instead
992          */
993         rt = (struct rt6_info *) *dst;
994         rcu_read_lock_bh();
995         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
996                                       rt6_nexthop(rt, &fl6->daddr));
997         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
998         rcu_read_unlock_bh();
999
1000         if (err) {
1001                 struct inet6_ifaddr *ifp;
1002                 struct flowi6 fl_gw6;
1003                 int redirect;
1004
1005                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1006                                       (*dst)->dev, 1);
1007
1008                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1009                 if (ifp)
1010                         in6_ifa_put(ifp);
1011
1012                 if (redirect) {
1013                         /*
1014                          * We need to get the dst entry for the
1015                          * default router instead
1016                          */
1017                         dst_release(*dst);
1018                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1019                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1020                         *dst = ip6_route_output(net, sk, &fl_gw6);
1021                         err = (*dst)->error;
1022                         if (err)
1023                                 goto out_err_release;
1024                 }
1025         }
1026 #endif
1027         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1028             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1029                 err = -EAFNOSUPPORT;
1030                 goto out_err_release;
1031         }
1032
1033         return 0;
1034
1035 out_err_release:
1036         dst_release(*dst);
1037         *dst = NULL;
1038
1039         if (err == -ENETUNREACH)
1040                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1041         return err;
1042 }
1043
1044 /**
1045  *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace in which the lookup is performed
1046  *      @sk: socket which provides route info
1047  *      @dst: pointer to dst_entry * for result
1048  *      @fl6: flow to lookup
1049  *
1050  *      This function performs a route lookup on the given flow.
1051  *
1052  *      It returns zero on success, or a standard errno code on error.
1053  */
1054 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1055                    struct flowi6 *fl6)
1056 {
1057         *dst = NULL;
1058         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1059 }
1060 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1061
1062 /**
1063  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1064  *      @sk: socket which provides route info
1065  *      @fl6: flow to lookup
1066  *      @final_dst: final destination address for ipsec lookup
1067  *
1068  *      This function performs a route lookup on the given flow.
1069  *
1070  *      It returns a valid dst pointer on success, or a pointer encoded
1071  *      error code.
1072  */
1073 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1074                                       const struct in6_addr *final_dst)
1075 {
1076         struct dst_entry *dst = NULL;
1077         int err;
1078
1079         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1080         if (err)
1081                 return ERR_PTR(err);
1082         if (final_dst)
1083                 fl6->daddr = *final_dst;
1084
1085         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1086 }
1087 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
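
/*
 * Illustrative sketch (not a caller from this file; final_p stands in
 * for the caller's final destination, if any): the return value is a
 * dst-or-error pointer, so callers typically unwrap it as:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;
 *	}
 */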
1088
1089 /**
1090  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1091  *      @sk: socket which provides the dst cache and route info
1092  *      @fl6: flow to lookup
1093  *      @final_dst: final destination address for ipsec lookup
1094  *      @connected: whether @sk is connected or not
1095  *
1096  *      This function performs a route lookup on the given flow with the
1097  *      possibility of using the cached route in the socket if it is valid.
1098  *      It will take the socket dst lock when operating on the dst cache.
1099  *      As a result, this function can only be used in process context.
1100  *
1101  *      In addition, for a connected socket, cache the dst in the socket
1102  *      if the current cache is not valid.
1103  *
1104  *      It returns a valid dst pointer on success, or a pointer encoded
1105  *      error code.
1106  */
1107 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1108                                          const struct in6_addr *final_dst,
1109                                          bool connected)
1110 {
1111         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1112
1113         dst = ip6_sk_dst_check(sk, dst, fl6);
1114         if (dst)
1115                 return dst;
1116
1117         dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1118         if (connected && !IS_ERR(dst))
1119                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1120
1121         return dst;
1122 }
1123 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1124
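/* An IPv6 extension header's hdrlen field counts 8-octet units and
 * excludes the first 8 octets, hence the (hdrlen + 1) * 8 sizing in
 * the two duplication helpers below.
 */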
1125 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1126                                                gfp_t gfp)
1127 {
1128         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1129 }
1130
1131 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1132                                                 gfp_t gfp)
1133 {
1134         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1135 }
1136
1137 static void ip6_append_data_mtu(unsigned int *mtu,
1138                                 int *maxfraglen,
1139                                 unsigned int fragheaderlen,
1140                                 struct sk_buff *skb,
1141                                 struct rt6_info *rt,
1142                                 unsigned int orig_mtu)
1143 {
1144         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1145                 if (!skb) {
1146                         /* first fragment, reserve header_len */
1147                         *mtu = orig_mtu - rt->dst.header_len;
1148
1149                 } else {
1150                         /*
1151                          * this fragment is not the first; the header
1152                          * space is regarded as data space.
1153                          */
1154                         *mtu = orig_mtu;
1155                 }
1156                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1157                               + fragheaderlen - sizeof(struct frag_hdr);
1158         }
1159 }
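
/* Worked example (values assumed for illustration): with mtu 1500 and
 * fragheaderlen 40 (a bare IPv6 header), maxfraglen becomes
 * ((1500 - 40) & ~7) + 40 - 8 = 1488; such a fragment plus its 8-byte
 * fragment header fits the MTU, and its payload length stays a
 * multiple of 8 octets.
 */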
1160
1161 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1162                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1163                           struct rt6_info *rt, struct flowi6 *fl6)
1164 {
1165         struct ipv6_pinfo *np = inet6_sk(sk);
1166         unsigned int mtu;
1167         struct ipv6_txoptions *opt = ipc6->opt;
1168
1169         /*
1170          * setup for corking
1171          */
1172         if (opt) {
1173                 if (WARN_ON(v6_cork->opt))
1174                         return -EINVAL;
1175
1176                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1177                 if (unlikely(!v6_cork->opt))
1178                         return -ENOBUFS;
1179
1180                 v6_cork->opt->tot_len = sizeof(*opt);
1181                 v6_cork->opt->opt_flen = opt->opt_flen;
1182                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1183
1184                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1185                                                     sk->sk_allocation);
1186                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1187                         return -ENOBUFS;
1188
1189                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1190                                                     sk->sk_allocation);
1191                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1192                         return -ENOBUFS;
1193
1194                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1195                                                    sk->sk_allocation);
1196                 if (opt->hopopt && !v6_cork->opt->hopopt)
1197                         return -ENOBUFS;
1198
1199                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1200                                                     sk->sk_allocation);
1201                 if (opt->srcrt && !v6_cork->opt->srcrt)
1202                         return -ENOBUFS;
1203
1204                 /* need source address above --miyazawa */
1205         }
1206         dst_hold(&rt->dst);
1207         cork->base.dst = &rt->dst;
1208         cork->fl.u.ip6 = *fl6;
1209         v6_cork->hop_limit = ipc6->hlimit;
1210         v6_cork->tclass = ipc6->tclass;
1211         if (rt->dst.flags & DST_XFRM_TUNNEL)
1212                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1213                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1214         else
1215                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1216                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1217         if (np->frag_size < mtu) {
1218                 if (np->frag_size)
1219                         mtu = np->frag_size;
1220         }
1221         if (mtu < IPV6_MIN_MTU)
1222                 return -EINVAL;
1223         cork->base.fragsize = mtu;
1224         cork->base.gso_size = ipc6->gso_size;
1225         cork->base.tx_flags = 0;
1226         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1227
1228         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1229                 cork->base.flags |= IPCORK_ALLFRAG;
1230         cork->base.length = 0;
1231
1232         cork->base.transmit_time = ipc6->sockc.transmit_time;
1233
1234         return 0;
1235 }
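
/* Note on the corked MTU above: when the socket probes path MTU
 * (pmtudisc >= IPV6_PMTUDISC_PROBE) the raw device MTU is used rather
 * than the (possibly reduced) route MTU, and a smaller nonzero
 * np->frag_size overrides either choice.
 */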
1236
1237 static int __ip6_append_data(struct sock *sk,
1238                              struct flowi6 *fl6,
1239                              struct sk_buff_head *queue,
1240                              struct inet_cork *cork,
1241                              struct inet6_cork *v6_cork,
1242                              struct page_frag *pfrag,
1243                              int getfrag(void *from, char *to, int offset,
1244                                          int len, int odd, struct sk_buff *skb),
1245                              void *from, int length, int transhdrlen,
1246                              unsigned int flags, struct ipcm6_cookie *ipc6)
1247 {
1248         struct sk_buff *skb, *skb_prev = NULL;
1249         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1250         int exthdrlen = 0;
1251         int dst_exthdrlen = 0;
1252         int hh_len;
1253         int copy;
1254         int err;
1255         int offset = 0;
1256         u32 tskey = 0;
1257         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1258         struct ipv6_txoptions *opt = v6_cork->opt;
1259         int csummode = CHECKSUM_NONE;
1260         unsigned int maxnonfragsize, headersize;
1261         unsigned int wmem_alloc_delta = 0;
1262         bool paged;
1263
1264         skb = skb_peek_tail(queue);
1265         if (!skb) {
1266                 exthdrlen = opt ? opt->opt_flen : 0;
1267                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1268         }
1269
1270         paged = !!cork->gso_size;
1271         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1272         orig_mtu = mtu;
1273
1274         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1275             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1276                 tskey = sk->sk_tskey++;
1277
1278         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1279
1280         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1281                         (opt ? opt->opt_nflen : 0);
1282         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1283                      sizeof(struct frag_hdr);
1284
1285         headersize = sizeof(struct ipv6hdr) +
1286                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1287                      (dst_allfrag(&rt->dst) ?
1288                       sizeof(struct frag_hdr) : 0) +
1289                      rt->rt6i_nfheader_len;
1290
1291         /* as per RFC 7112 section 5, the entire IPv6 header chain must fit
1292          * in the first fragment
1293          */
1294         if (headersize + transhdrlen > mtu)
1295                 goto emsgsize;
1296
1297         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1298             (sk->sk_protocol == IPPROTO_UDP ||
1299              sk->sk_protocol == IPPROTO_RAW)) {
1300                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1301                                 sizeof(struct ipv6hdr));
1302                 goto emsgsize;
1303         }
1304
1305         if (ip6_sk_ignore_df(sk))
1306                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1307         else
1308                 maxnonfragsize = mtu;
1309
1310         if (cork->length + length > maxnonfragsize - headersize) {
1311 emsgsize:
1312                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1313                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1314                 return -EMSGSIZE;
1315         }
1316
1317         /* CHECKSUM_PARTIAL only with no extension headers and when
1318          * we are not going to fragment
1319          */
1320         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1321             headersize == sizeof(struct ipv6hdr) &&
1322             length <= mtu - headersize &&
1323             (!(flags & MSG_MORE) || cork->gso_size) &&
1324             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1325                 csummode = CHECKSUM_PARTIAL;
1326
1327         /*
1328          * Let's try using as much space as possible.
1329          * Use MTU if total length of the message fits into the MTU.
1330          * Otherwise, we need to reserve fragment header and
1331          * fragment alignment (= 8-15 octets, in total).
1332          *
1333          * Note that we may need to "move" the data from the tail
1334          * of the buffer to the new fragment when we split
1335          * the message.
1336          *
1337          * FIXME: It may be fragmented into multiple chunks
1338          *        at once if non-fragmentable extension headers
1339          *        are too large.
1340          * --yoshfuji
1341          */
1342
1343         cork->length += length;
1344         if (!skb)
1345                 goto alloc_new_skb;
1346
1347         while (length > 0) {
1348                 /* Check if the remaining data fits into current packet. */
1349                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1350                 if (copy < length)
1351                         copy = maxfraglen - skb->len;
1352
1353                 if (copy <= 0) {
1354                         char *data;
1355                         unsigned int datalen;
1356                         unsigned int fraglen;
1357                         unsigned int fraggap;
1358                         unsigned int alloclen;
1359                         unsigned int pagedlen = 0;
1360 alloc_new_skb:
1361                         /* There's no room in the current skb */
1362                         if (skb)
1363                                 fraggap = skb->len - maxfraglen;
1364                         else
1365                                 fraggap = 0;
1366                         /* update mtu and maxfraglen if necessary */
1367                         if (!skb || !skb_prev)
1368                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1369                                                     fragheaderlen, skb, rt,
1370                                                     orig_mtu);
1371
1372                         skb_prev = skb;
1373
1374                         /*
1375                          * If remaining data exceeds the mtu,
1376                          * we know we need more fragment(s).
1377                          */
1378                         datalen = length + fraggap;
1379
1380                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1381                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1382                         fraglen = datalen + fragheaderlen;
1383
1384                         if ((flags & MSG_MORE) &&
1385                             !(rt->dst.dev->features&NETIF_F_SG))
1386                                 alloclen = mtu;
1387                         else if (!paged)
1388                                 alloclen = fraglen;
1389                         else {
1390                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1391                                 pagedlen = fraglen - alloclen;
1392                         }
1393
1394                         alloclen += dst_exthdrlen;
1395
1396                         if (datalen != length + fraggap) {
1397                                 /*
1398                                  * this is not the last fragment; the trailer
1399                                  * space is regarded as data space.
1400                                  */
1401                                 datalen += rt->dst.trailer_len;
1402                         }

                        alloclen += rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;

                        /*
                         * We just reserve space for the fragment header.
                         * Note: this may be an over-allocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        copy = datalen - transhdrlen - fraggap - pagedlen;
                        if (copy < 0) {
                                err = -EINVAL;
                                goto error;
                        }
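                        /*
                         * Two allocation paths: the first skb of a message
                         * (transhdrlen != 0) is charged to the socket by
                         * sock_alloc_send_skb() and may block; follow-up
                         * fragments use plain alloc_skb(), capped at
                         * 2 * sk_sndbuf, with the charge deferred through
                         * wmem_alloc_delta.
                         */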
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
                                    2 * sk->sk_sndbuf)
                                        skb = alloc_skb(alloclen + hh_len,
                                                        sk->sk_allocation);
                                if (unlikely(!skb))
                                        err = -ENOBUFS;
                        }
                        if (!skb)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->protocol = htons(ETH_P_IPV6);
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation and ipsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        /* Only the initial fragment is time stamped */
                        skb_shinfo(skb)->tx_flags = cork->tx_flags;
                        cork->tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen - pagedlen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
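                        /*
                         * If the previous skb overran maxfraglen, move the
                         * excess (fraggap bytes) from its tail into this new
                         * fragment and fix up both checksums.
                         */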
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        if (copy > 0 &&
                            getfrag(from, data + transhdrlen, offset,
                                    copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= copy + transhdrlen;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        if ((flags & MSG_CONFIRM) && !skb_prev)
                                skb_set_dst_pending_confirm(skb, 1);

                        /*
                         * Put the packet on the pending queue
                         */
                        if (!skb->destructor) {
                                skb->destructor = sock_wfree;
                                skb->sk = sk;
                                wmem_alloc_delta += skb->truesize;
                        }
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

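                /*
                 * No scatter-gather support on the device: the data has to
                 * sit in the linear area, so copy straight into the skb's
                 * tailroom.
                 */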
                if (!(rt->dst.dev->features & NETIF_F_SG) &&
                    skb_tailroom(skb) >= copy) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
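                        /*
                         * Scatter-gather path: land the bytes in the
                         * socket's page fragment, coalescing with the last
                         * frag when possible, else starting a new one.
                         */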
                        int i = skb_shinfo(skb)->nr_frags;

                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        wmem_alloc_delta += copy;
                }
                offset += copy;
                length -= copy;
        }

        if (wmem_alloc_delta)
                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return 0;

error_efault:
        err = -EFAULT;
error:
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return err;
}

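/**
 *      ip6_append_data - queue data for transmission on a corked socket
 *      @sk: socket we are sending on
 *      @getfrag: callback that copies @len bytes of caller data to @to
 *      @from: opaque cookie passed through to @getfrag
 *      @length: number of bytes to append
 *      @transhdrlen: size of the transport header (non-zero only on the
 *              first call of a message)
 *      @ipc6: ancillary data (hop limit, tclass, options, ...)
 *      @fl6: flow to send on
 *      @rt: route to use
 *      @flags: MSG_* flags from the caller
 *
 *      The first call on an empty write queue sets up the cork from @ipc6,
 *      @rt and @fl6; later calls reuse the corked flow and must not supply
 *      a transport header again. Data is buffered on sk_write_queue until
 *      ip6_push_pending_frames() builds and sends the packet(s).
 */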
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        int exthdrlen;
        int err;

        if (flags & MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
                                     ipc6, rt, fl6);
                if (err)
                        return err;

                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                fl6 = &inet->cork.fl.u.ip6;
                transhdrlen = 0;
        }

        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
                                 &np->cork, sk_page_frag(sk), getfrag,
                                 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

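/*
 * Typical usage (a simplified sketch of the corking pattern; the
 * canonical caller is udpv6_sendmsg() in net/ipv6/udp.c, which also
 * fills in the transport header before pushing):
 *
 *      lock_sock(sk);
 *      err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
 *                            sizeof(struct udphdr), &ipc6, &fl6,
 *                            (struct rt6_info *)dst, msg->msg_flags);
 *      if (err)
 *              ip6_flush_pending_frames(sk);
 *      else if (!(msg->msg_flags & MSG_MORE))
 *              err = ip6_push_pending_frames(sk);
 *      release_sock(sk);
 */

/*
 * Free the extension-header options and drop the route cached in the
 * cork, returning it to a pristine state.
 */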
static void ip6_cork_release(struct inet_cork_full *cork,
                             struct inet6_cork *v6_cork)
{
        if (v6_cork->opt) {
                kfree(v6_cork->opt->dst0opt);
                kfree(v6_cork->opt->dst1opt);
                kfree(v6_cork->opt->hopopt);
                kfree(v6_cork->opt->srcrt);
                kfree(v6_cork->opt);
                v6_cork->opt = NULL;
        }

        if (cork->base.dst) {
                dst_release(cork->base.dst);
                cork->base.dst = NULL;
                cork->base.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&cork->fl, 0, sizeof(cork->fl));
}

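/*
 * Collapse the pending queue into a single packet: chain the follow-up
 * skbs onto the first one's frag_list, push the extension headers and
 * the IPv6 header, account the output stats and release the cork.
 * The caller owns the returned skb (NULL if the queue was empty).
 */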
struct sk_buff *__ip6_make_skb(struct sock *sk,
                               struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = v6_cork->opt;
        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
        struct flowi6 *fl6 = &cork->fl.u.ip6;
        unsigned char proto = fl6->flowi6_proto;

        skb = __skb_dequeue(queue);
        if (!skb)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* Move skb->data from the extension header to the IP header. */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        skb->ignore_df = ip6_sk_ignore_df(sk);

        *final_dst = fl6->daddr;
        __skb_pull(skb, skb_network_header_len(skb));
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        ip6_flow_hdr(hdr, v6_cork->tclass,
                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                        ip6_autoflowlabel(net, np), fl6));
        hdr->hop_limit = v6_cork->hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
        hdr->daddr = *final_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        skb->tstamp = cork->base.transmit_time;

        skb_dst_set(skb, dst_clone(&rt->dst));
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        }

        ip6_cork_release(cork, v6_cork);
out:
        return skb;
}

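/*
 * Hand a packet built by __ip6_make_skb() to the output path, mapping
 * soft congestion-notification returns through net_xmit_errno() and
 * counting hard failures as output discards.
 */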
int ip6_send_skb(struct sk_buff *skb)
{
        struct net *net = sock_net(skb->sk);
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        int err;

        err = ip6_local_out(net, skb->sk, skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        IP6_INC_STATS(net, rt->rt6i_idev,
                                      IPSTATS_MIB_OUTDISCARDS);
        }

        return err;
}

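/*
 * Build a single packet from the socket's pending write queue and send
 * it. A NULL from ip6_finish_skb() (empty queue) is not an error.
 */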
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        skb = ip6_finish_skb(sk);
        if (!skb)
                return 0;

        return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

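/*
 * Drop everything queued on @queue, counting skbs that already have a
 * route as output discards, then release the cork.
 */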
static void __ip6_flush_pending_frames(struct sock *sk,
                                       struct sk_buff_head *queue,
                                       struct inet_cork_full *cork,
                                       struct inet6_cork *v6_cork)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
                if (skb_dst(skb))
                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        ip6_cork_release(cork, v6_cork);
}

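/*
 * Discard the frames pending on the socket's default write queue, e.g.
 * after an error in the middle of a corked send.
 */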
void ip6_flush_pending_frames(struct sock *sk)
{
        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

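/*
 * One-shot (uncorked) variant of the append/push sequence: build the
 * whole message on a private queue with a caller-supplied cork and
 * return the finished skb, so no socket-wide corking state is touched.
 * Errors flush the private queue and are returned as ERR_PTR().
 */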
struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                             struct rt6_info *rt, unsigned int flags,
                             struct inet_cork_full *cork)
{
        struct inet6_cork v6_cork;
        struct sk_buff_head queue;
        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
        int err;

        if (flags & MSG_PROBE)
                return NULL;

        __skb_queue_head_init(&queue);

        cork->base.flags = 0;
        cork->base.addr = 0;
        cork->base.opt = NULL;
        cork->base.dst = NULL;
        v6_cork.opt = NULL;
        err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
        if (err) {
                ip6_cork_release(cork, &v6_cork);
                return ERR_PTR(err);
        }
        if (ipc6->dontfrag < 0)
                ipc6->dontfrag = inet6_sk(sk)->dontfrag;

        err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
                                &current->task_frag, getfrag, from,
                                length + exthdrlen, transhdrlen + exthdrlen,
                                flags, ipc6);
        if (err) {
                __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
                return ERR_PTR(err);
        }

        return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}