]> asedeno.scripts.mit.edu Git - linux.git/blob - net/ipv6/ip6_output.c
Merge branches 'pm-core', 'pm-qos', 'pm-domains' and 'pm-opp'
[linux.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64         struct dst_entry *dst = skb_dst(skb);
65         struct net_device *dev = dst->dev;
66         struct neighbour *neigh;
67         struct in6_addr *nexthop;
68         int ret;
69
70         skb->protocol = htons(ETH_P_IPV6);
71         skb->dev = dev;
72
73         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
74                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
75
76                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
77                     ((mroute6_socket(net, skb) &&
78                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
79                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
80                                          &ipv6_hdr(skb)->saddr))) {
81                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
82
83                         /* Do not check for IFF_ALLMULTI; multicast routing
84                            is not supported in any case.
85                          */
86                         if (newskb)
87                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
88                                         net, sk, newskb, NULL, newskb->dev,
89                                         dev_loopback_xmit);
90
91                         if (ipv6_hdr(skb)->hop_limit == 0) {
92                                 IP6_INC_STATS(net, idev,
93                                               IPSTATS_MIB_OUTDISCARDS);
94                                 kfree_skb(skb);
95                                 return 0;
96                         }
97                 }
98
99                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
100
101                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
102                     IPV6_ADDR_SCOPE_NODELOCAL &&
103                     !(dev->flags & IFF_LOOPBACK)) {
104                         kfree_skb(skb);
105                         return 0;
106                 }
107         }
108
109         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
110                 int res = lwtunnel_xmit(skb);
111
112                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
113                         return res;
114         }
115
116         rcu_read_lock_bh();
117         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
118         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
119         if (unlikely(!neigh))
120                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
121         if (!IS_ERR(neigh)) {
122                 ret = dst_neigh_output(dst, neigh, skb);
123                 rcu_read_unlock_bh();
124                 return ret;
125         }
126         rcu_read_unlock_bh();
127
128         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
129         kfree_skb(skb);
130         return -EINVAL;
131 }
132
133 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
134 {
135         int ret;
136
137         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
138         if (ret) {
139                 kfree_skb(skb);
140                 return ret;
141         }
142
143         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
144             dst_allfrag(skb_dst(skb)) ||
145             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
146                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
147         else
148                 return ip6_finish_output2(net, sk, skb);
149 }
150
151 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
152 {
153         struct net_device *dev = skb_dst(skb)->dev;
154         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
155
156         if (unlikely(idev->cnf.disable_ipv6)) {
157                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
158                 kfree_skb(skb);
159                 return 0;
160         }
161
162         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
163                             net, sk, skb, NULL, dev,
164                             ip6_finish_output,
165                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
166 }
167
168 /*
169  * xmit an sk_buff (used by TCP, SCTP and DCCP)
170  * Note : socket lock is not held for SYNACK packets, but might be modified
171  * by calls to skb_set_owner_w() and ipv6_local_error(),
172  * which are using proper atomic operations or spinlocks.
173  */
174 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
175              __u32 mark, struct ipv6_txoptions *opt, int tclass)
176 {
177         struct net *net = sock_net(sk);
178         const struct ipv6_pinfo *np = inet6_sk(sk);
179         struct in6_addr *first_hop = &fl6->daddr;
180         struct dst_entry *dst = skb_dst(skb);
181         struct ipv6hdr *hdr;
182         u8  proto = fl6->flowi6_proto;
183         int seg_len = skb->len;
184         int hlimit = -1;
185         u32 mtu;
186
187         if (opt) {
188                 unsigned int head_room;
189
190                 /* First: exthdrs may take lots of space (~8K for now)
191                    MAX_HEADER is not enough.
192                  */
193                 head_room = opt->opt_nflen + opt->opt_flen;
194                 seg_len += head_room;
195                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
196
197                 if (skb_headroom(skb) < head_room) {
198                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
199                         if (!skb2) {
200                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
201                                               IPSTATS_MIB_OUTDISCARDS);
202                                 kfree_skb(skb);
203                                 return -ENOBUFS;
204                         }
205                         consume_skb(skb);
206                         skb = skb2;
207                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
208                          * it is safe to call in our context (socket lock not held)
209                          */
210                         skb_set_owner_w(skb, (struct sock *)sk);
211                 }
212                 if (opt->opt_flen)
213                         ipv6_push_frag_opts(skb, opt, &proto);
214                 if (opt->opt_nflen)
215                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
216                                              &fl6->saddr);
217         }
218
219         skb_push(skb, sizeof(struct ipv6hdr));
220         skb_reset_network_header(skb);
221         hdr = ipv6_hdr(skb);
222
223         /*
224          *      Fill in the IPv6 header
225          */
226         if (np)
227                 hlimit = np->hop_limit;
228         if (hlimit < 0)
229                 hlimit = ip6_dst_hoplimit(dst);
230
231         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
232                                                      np->autoflowlabel, fl6));
233
234         hdr->payload_len = htons(seg_len);
235         hdr->nexthdr = proto;
236         hdr->hop_limit = hlimit;
237
238         hdr->saddr = fl6->saddr;
239         hdr->daddr = *first_hop;
240
241         skb->protocol = htons(ETH_P_IPV6);
242         skb->priority = sk->sk_priority;
243         skb->mark = mark;
244
245         mtu = dst_mtu(dst);
246         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
247                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248                               IPSTATS_MIB_OUT, skb->len);
249
250                 /* if egress device is enslaved to an L3 master device pass the
251                  * skb to its handler for processing
252                  */
253                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
254                 if (unlikely(!skb))
255                         return 0;
256
257                 /* hooks should never assume socket lock is held.
258                  * we promote our socket to non const
259                  */
260                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
261                                net, (struct sock *)sk, skb, NULL, dst->dev,
262                                dst_output);
263         }
264
265         skb->dev = dst->dev;
266         /* ipv6_local_error() does not require socket lock,
267          * we promote our socket to non const
268          */
269         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
270
271         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
272         kfree_skb(skb);
273         return -EMSGSIZE;
274 }
275 EXPORT_SYMBOL(ip6_xmit);
276
277 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
278 {
279         struct ip6_ra_chain *ra;
280         struct sock *last = NULL;
281
282         read_lock(&ip6_ra_lock);
283         for (ra = ip6_ra_chain; ra; ra = ra->next) {
284                 struct sock *sk = ra->sk;
285                 if (sk && ra->sel == sel &&
286                     (!sk->sk_bound_dev_if ||
287                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
288                         if (last) {
289                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
290                                 if (skb2)
291                                         rawv6_rcv(last, skb2);
292                         }
293                         last = sk;
294                 }
295         }
296
297         if (last) {
298                 rawv6_rcv(last, skb);
299                 read_unlock(&ip6_ra_lock);
300                 return 1;
301         }
302         read_unlock(&ip6_ra_lock);
303         return 0;
304 }
305
306 static int ip6_forward_proxy_check(struct sk_buff *skb)
307 {
308         struct ipv6hdr *hdr = ipv6_hdr(skb);
309         u8 nexthdr = hdr->nexthdr;
310         __be16 frag_off;
311         int offset;
312
313         if (ipv6_ext_hdr(nexthdr)) {
314                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
315                 if (offset < 0)
316                         return 0;
317         } else
318                 offset = sizeof(struct ipv6hdr);
319
320         if (nexthdr == IPPROTO_ICMPV6) {
321                 struct icmp6hdr *icmp6;
322
323                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
324                                          offset + 1 - skb->data)))
325                         return 0;
326
327                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
328
329                 switch (icmp6->icmp6_type) {
330                 case NDISC_ROUTER_SOLICITATION:
331                 case NDISC_ROUTER_ADVERTISEMENT:
332                 case NDISC_NEIGHBOUR_SOLICITATION:
333                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
334                 case NDISC_REDIRECT:
335                         /* For reaction involving unicast neighbor discovery
336                          * message destined to the proxied address, pass it to
337                          * input function.
338                          */
339                         return 1;
340                 default:
341                         break;
342                 }
343         }
344
345         /*
346          * The proxying router can't forward traffic sent to a link-local
347          * address, so signal the sender and discard the packet. This
348          * behavior is clarified by the MIPv6 specification.
349          */
350         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
351                 dst_link_failure(skb);
352                 return -1;
353         }
354
355         return 0;
356 }
357
/*
 * Continuation invoked once the FORWARD netfilter hook accepts a packet:
 * pass the skb to the output function of its attached route.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc = dst_output(net, sk, skb);

	return rc;
}
363
364 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
365 {
366         unsigned int mtu;
367         struct inet6_dev *idev;
368
369         if (dst_metric_locked(dst, RTAX_MTU)) {
370                 mtu = dst_metric_raw(dst, RTAX_MTU);
371                 if (mtu)
372                         return mtu;
373         }
374
375         mtu = IPV6_MIN_MTU;
376         rcu_read_lock();
377         idev = __in6_dev_get(dst->dev);
378         if (idev)
379                 mtu = idev->cnf.mtu6;
380         rcu_read_unlock();
381
382         return mtu;
383 }
384
385 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
386 {
387         if (skb->len <= mtu)
388                 return false;
389
390         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
391         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
392                 return true;
393
394         if (skb->ignore_df)
395                 return false;
396
397         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
398                 return false;
399
400         return true;
401 }
402
403 int ip6_forward(struct sk_buff *skb)
404 {
405         struct dst_entry *dst = skb_dst(skb);
406         struct ipv6hdr *hdr = ipv6_hdr(skb);
407         struct inet6_skb_parm *opt = IP6CB(skb);
408         struct net *net = dev_net(dst->dev);
409         u32 mtu;
410
411         if (net->ipv6.devconf_all->forwarding == 0)
412                 goto error;
413
414         if (skb->pkt_type != PACKET_HOST)
415                 goto drop;
416
417         if (unlikely(skb->sk))
418                 goto drop;
419
420         if (skb_warn_if_lro(skb))
421                 goto drop;
422
423         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
424                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
425                                 IPSTATS_MIB_INDISCARDS);
426                 goto drop;
427         }
428
429         skb_forward_csum(skb);
430
431         /*
432          *      We DO NOT make any processing on
433          *      RA packets, pushing them to user level AS IS
434          *      without ane WARRANTY that application will be able
435          *      to interpret them. The reason is that we
436          *      cannot make anything clever here.
437          *
438          *      We are not end-node, so that if packet contains
439          *      AH/ESP, we cannot make anything.
440          *      Defragmentation also would be mistake, RA packets
441          *      cannot be fragmented, because there is no warranty
442          *      that different fragments will go along one path. --ANK
443          */
444         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
445                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
446                         return 0;
447         }
448
449         /*
450          *      check and decrement ttl
451          */
452         if (hdr->hop_limit <= 1) {
453                 /* Force OUTPUT device used as source address */
454                 skb->dev = dst->dev;
455                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
456                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
457                                 IPSTATS_MIB_INHDRERRORS);
458
459                 kfree_skb(skb);
460                 return -ETIMEDOUT;
461         }
462
463         /* XXX: idev->cnf.proxy_ndp? */
464         if (net->ipv6.devconf_all->proxy_ndp &&
465             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
466                 int proxied = ip6_forward_proxy_check(skb);
467                 if (proxied > 0)
468                         return ip6_input(skb);
469                 else if (proxied < 0) {
470                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
471                                         IPSTATS_MIB_INDISCARDS);
472                         goto drop;
473                 }
474         }
475
476         if (!xfrm6_route_forward(skb)) {
477                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
478                                 IPSTATS_MIB_INDISCARDS);
479                 goto drop;
480         }
481         dst = skb_dst(skb);
482
483         /* IPv6 specs say nothing about it, but it is clear that we cannot
484            send redirects to source routed frames.
485            We don't send redirects to frames decapsulated from IPsec.
486          */
487         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
488                 struct in6_addr *target = NULL;
489                 struct inet_peer *peer;
490                 struct rt6_info *rt;
491
492                 /*
493                  *      incoming and outgoing devices are the same
494                  *      send a redirect.
495                  */
496
497                 rt = (struct rt6_info *) dst;
498                 if (rt->rt6i_flags & RTF_GATEWAY)
499                         target = &rt->rt6i_gateway;
500                 else
501                         target = &hdr->daddr;
502
503                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
504
505                 /* Limit redirects both by destination (here)
506                    and by source (inside ndisc_send_redirect)
507                  */
508                 if (inet_peer_xrlim_allow(peer, 1*HZ))
509                         ndisc_send_redirect(skb, target);
510                 if (peer)
511                         inet_putpeer(peer);
512         } else {
513                 int addrtype = ipv6_addr_type(&hdr->saddr);
514
515                 /* This check is security critical. */
516                 if (addrtype == IPV6_ADDR_ANY ||
517                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
518                         goto error;
519                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
520                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
521                                     ICMPV6_NOT_NEIGHBOUR, 0);
522                         goto error;
523                 }
524         }
525
526         mtu = ip6_dst_mtu_forward(dst);
527         if (mtu < IPV6_MIN_MTU)
528                 mtu = IPV6_MIN_MTU;
529
530         if (ip6_pkt_too_big(skb, mtu)) {
531                 /* Again, force OUTPUT device used as source address */
532                 skb->dev = dst->dev;
533                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
534                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
535                                 IPSTATS_MIB_INTOOBIGERRORS);
536                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
537                                 IPSTATS_MIB_FRAGFAILS);
538                 kfree_skb(skb);
539                 return -EMSGSIZE;
540         }
541
542         if (skb_cow(skb, dst->dev->hard_header_len)) {
543                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
544                                 IPSTATS_MIB_OUTDISCARDS);
545                 goto drop;
546         }
547
548         hdr = ipv6_hdr(skb);
549
550         /* Mangling hops number delayed to point after skb COW */
551
552         hdr->hop_limit--;
553
554         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
555         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
556         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
557                        net, NULL, skb, skb->dev, dst->dev,
558                        ip6_forward_finish);
559
560 error:
561         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
562 drop:
563         kfree_skb(skb);
564         return -EINVAL;
565 }
566
567 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
568 {
569         to->pkt_type = from->pkt_type;
570         to->priority = from->priority;
571         to->protocol = from->protocol;
572         skb_dst_drop(to);
573         skb_dst_set(to, dst_clone(skb_dst(from)));
574         to->dev = from->dev;
575         to->mark = from->mark;
576
577 #ifdef CONFIG_NET_SCHED
578         to->tc_index = from->tc_index;
579 #endif
580         nf_copy(to, from);
581         skb_copy_secmark(to, from);
582 }
583
584 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
585                  int (*output)(struct net *, struct sock *, struct sk_buff *))
586 {
587         struct sk_buff *frag;
588         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
589         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
590                                 inet6_sk(skb->sk) : NULL;
591         struct ipv6hdr *tmp_hdr;
592         struct frag_hdr *fh;
593         unsigned int mtu, hlen, left, len;
594         int hroom, troom;
595         __be32 frag_id;
596         int ptr, offset = 0, err = 0;
597         u8 *prevhdr, nexthdr = 0;
598
599         hlen = ip6_find_1stfragopt(skb, &prevhdr);
600         nexthdr = *prevhdr;
601
602         mtu = ip6_skb_dst_mtu(skb);
603
604         /* We must not fragment if the socket is set to force MTU discovery
605          * or if the skb it not generated by a local socket.
606          */
607         if (unlikely(!skb->ignore_df && skb->len > mtu))
608                 goto fail_toobig;
609
610         if (IP6CB(skb)->frag_max_size) {
611                 if (IP6CB(skb)->frag_max_size > mtu)
612                         goto fail_toobig;
613
614                 /* don't send fragments larger than what we received */
615                 mtu = IP6CB(skb)->frag_max_size;
616                 if (mtu < IPV6_MIN_MTU)
617                         mtu = IPV6_MIN_MTU;
618         }
619
620         if (np && np->frag_size < mtu) {
621                 if (np->frag_size)
622                         mtu = np->frag_size;
623         }
624         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
625                 goto fail_toobig;
626         mtu -= hlen + sizeof(struct frag_hdr);
627
628         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
629                                     &ipv6_hdr(skb)->saddr);
630
631         if (skb->ip_summed == CHECKSUM_PARTIAL &&
632             (err = skb_checksum_help(skb)))
633                 goto fail;
634
635         hroom = LL_RESERVED_SPACE(rt->dst.dev);
636         if (skb_has_frag_list(skb)) {
637                 unsigned int first_len = skb_pagelen(skb);
638                 struct sk_buff *frag2;
639
640                 if (first_len - hlen > mtu ||
641                     ((first_len - hlen) & 7) ||
642                     skb_cloned(skb) ||
643                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
644                         goto slow_path;
645
646                 skb_walk_frags(skb, frag) {
647                         /* Correct geometry. */
648                         if (frag->len > mtu ||
649                             ((frag->len & 7) && frag->next) ||
650                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
651                                 goto slow_path_clean;
652
653                         /* Partially cloned skb? */
654                         if (skb_shared(frag))
655                                 goto slow_path_clean;
656
657                         BUG_ON(frag->sk);
658                         if (skb->sk) {
659                                 frag->sk = skb->sk;
660                                 frag->destructor = sock_wfree;
661                         }
662                         skb->truesize -= frag->truesize;
663                 }
664
665                 err = 0;
666                 offset = 0;
667                 /* BUILD HEADER */
668
669                 *prevhdr = NEXTHDR_FRAGMENT;
670                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
671                 if (!tmp_hdr) {
672                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
673                                       IPSTATS_MIB_FRAGFAILS);
674                         err = -ENOMEM;
675                         goto fail;
676                 }
677                 frag = skb_shinfo(skb)->frag_list;
678                 skb_frag_list_init(skb);
679
680                 __skb_pull(skb, hlen);
681                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
682                 __skb_push(skb, hlen);
683                 skb_reset_network_header(skb);
684                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
685
686                 fh->nexthdr = nexthdr;
687                 fh->reserved = 0;
688                 fh->frag_off = htons(IP6_MF);
689                 fh->identification = frag_id;
690
691                 first_len = skb_pagelen(skb);
692                 skb->data_len = first_len - skb_headlen(skb);
693                 skb->len = first_len;
694                 ipv6_hdr(skb)->payload_len = htons(first_len -
695                                                    sizeof(struct ipv6hdr));
696
697                 dst_hold(&rt->dst);
698
699                 for (;;) {
700                         /* Prepare header of the next frame,
701                          * before previous one went down. */
702                         if (frag) {
703                                 frag->ip_summed = CHECKSUM_NONE;
704                                 skb_reset_transport_header(frag);
705                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
706                                 __skb_push(frag, hlen);
707                                 skb_reset_network_header(frag);
708                                 memcpy(skb_network_header(frag), tmp_hdr,
709                                        hlen);
710                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
711                                 fh->nexthdr = nexthdr;
712                                 fh->reserved = 0;
713                                 fh->frag_off = htons(offset);
714                                 if (frag->next)
715                                         fh->frag_off |= htons(IP6_MF);
716                                 fh->identification = frag_id;
717                                 ipv6_hdr(frag)->payload_len =
718                                                 htons(frag->len -
719                                                       sizeof(struct ipv6hdr));
720                                 ip6_copy_metadata(frag, skb);
721                         }
722
723                         err = output(net, sk, skb);
724                         if (!err)
725                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
726                                               IPSTATS_MIB_FRAGCREATES);
727
728                         if (err || !frag)
729                                 break;
730
731                         skb = frag;
732                         frag = skb->next;
733                         skb->next = NULL;
734                 }
735
736                 kfree(tmp_hdr);
737
738                 if (err == 0) {
739                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
740                                       IPSTATS_MIB_FRAGOKS);
741                         ip6_rt_put(rt);
742                         return 0;
743                 }
744
745                 kfree_skb_list(frag);
746
747                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
748                               IPSTATS_MIB_FRAGFAILS);
749                 ip6_rt_put(rt);
750                 return err;
751
752 slow_path_clean:
753                 skb_walk_frags(skb, frag2) {
754                         if (frag2 == frag)
755                                 break;
756                         frag2->sk = NULL;
757                         frag2->destructor = NULL;
758                         skb->truesize += frag2->truesize;
759                 }
760         }
761
762 slow_path:
763         left = skb->len - hlen;         /* Space per frame */
764         ptr = hlen;                     /* Where to start from */
765
766         /*
767          *      Fragment the datagram.
768          */
769
770         *prevhdr = NEXTHDR_FRAGMENT;
771         troom = rt->dst.dev->needed_tailroom;
772
773         /*
774          *      Keep copying data until we run out.
775          */
776         while (left > 0)        {
777                 len = left;
778                 /* IF: it doesn't fit, use 'mtu' - the data space left */
779                 if (len > mtu)
780                         len = mtu;
781                 /* IF: we are not sending up to and including the packet end
782                    then align the next start on an eight byte boundary */
783                 if (len < left) {
784                         len &= ~7;
785                 }
786
787                 /* Allocate buffer */
788                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
789                                  hroom + troom, GFP_ATOMIC);
790                 if (!frag) {
791                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
792                                       IPSTATS_MIB_FRAGFAILS);
793                         err = -ENOMEM;
794                         goto fail;
795                 }
796
797                 /*
798                  *      Set up data on packet
799                  */
800
801                 ip6_copy_metadata(frag, skb);
802                 skb_reserve(frag, hroom);
803                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
804                 skb_reset_network_header(frag);
805                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
806                 frag->transport_header = (frag->network_header + hlen +
807                                           sizeof(struct frag_hdr));
808
809                 /*
810                  *      Charge the memory for the fragment to any owner
811                  *      it might possess
812                  */
813                 if (skb->sk)
814                         skb_set_owner_w(frag, skb->sk);
815
816                 /*
817                  *      Copy the packet header into the new buffer.
818                  */
819                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
820
821                 /*
822                  *      Build fragment header.
823                  */
824                 fh->nexthdr = nexthdr;
825                 fh->reserved = 0;
826                 fh->identification = frag_id;
827
828                 /*
829                  *      Copy a block of the IP datagram.
830                  */
831                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
832                                      len));
833                 left -= len;
834
835                 fh->frag_off = htons(offset);
836                 if (left > 0)
837                         fh->frag_off |= htons(IP6_MF);
838                 ipv6_hdr(frag)->payload_len = htons(frag->len -
839                                                     sizeof(struct ipv6hdr));
840
841                 ptr += len;
842                 offset += len;
843
844                 /*
845                  *      Put this fragment into the sending queue.
846                  */
847                 err = output(net, sk, frag);
848                 if (err)
849                         goto fail;
850
851                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
852                               IPSTATS_MIB_FRAGCREATES);
853         }
854         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
855                       IPSTATS_MIB_FRAGOKS);
856         consume_skb(skb);
857         return err;
858
859 fail_toobig:
860         if (skb->sk && dst_allfrag(skb_dst(skb)))
861                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
862
863         skb->dev = skb_dst(skb)->dev;
864         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
865         err = -EMSGSIZE;
866
867 fail:
868         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
869                       IPSTATS_MIB_FRAGFAILS);
870         kfree_skb(skb);
871         return err;
872 }
873
874 static inline int ip6_rt_check(const struct rt6key *rt_key,
875                                const struct in6_addr *fl_addr,
876                                const struct in6_addr *addr_cache)
877 {
878         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
879                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
880 }
881
882 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
883                                           struct dst_entry *dst,
884                                           const struct flowi6 *fl6)
885 {
886         struct ipv6_pinfo *np = inet6_sk(sk);
887         struct rt6_info *rt;
888
889         if (!dst)
890                 goto out;
891
892         if (dst->ops->family != AF_INET6) {
893                 dst_release(dst);
894                 return NULL;
895         }
896
897         rt = (struct rt6_info *)dst;
898         /* Yes, checking route validity in not connected
899          * case is not very simple. Take into account,
900          * that we do not support routing by source, TOS,
901          * and MSG_DONTROUTE            --ANK (980726)
902          *
903          * 1. ip6_rt_check(): If route was host route,
904          *    check that cached destination is current.
905          *    If it is network route, we still may
906          *    check its validity using saved pointer
907          *    to the last used address: daddr_cache.
908          *    We do not want to save whole address now,
909          *    (because main consumer of this service
910          *    is tcp, which has not this problem),
911          *    so that the last trick works only on connected
912          *    sockets.
913          * 2. oif also should be the same.
914          */
915         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
916 #ifdef CONFIG_IPV6_SUBTREES
917             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
918 #endif
919            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
920               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
921                 dst_release(dst);
922                 dst = NULL;
923         }
924
925 out:
926         return dst;
927 }
928
929 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
930                                struct dst_entry **dst, struct flowi6 *fl6)
931 {
932 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
933         struct neighbour *n;
934         struct rt6_info *rt;
935 #endif
936         int err;
937         int flags = 0;
938
939         /* The correct way to handle this would be to do
940          * ip6_route_get_saddr, and then ip6_route_output; however,
941          * the route-specific preferred source forces the
942          * ip6_route_output call _before_ ip6_route_get_saddr.
943          *
944          * In source specific routing (no src=any default route),
945          * ip6_route_output will fail given src=any saddr, though, so
946          * that's why we try it again later.
947          */
948         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
949                 struct rt6_info *rt;
950                 bool had_dst = *dst != NULL;
951
952                 if (!had_dst)
953                         *dst = ip6_route_output(net, sk, fl6);
954                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
955                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
956                                           sk ? inet6_sk(sk)->srcprefs : 0,
957                                           &fl6->saddr);
958                 if (err)
959                         goto out_err_release;
960
961                 /* If we had an erroneous initial result, pretend it
962                  * never existed and let the SA-enabled version take
963                  * over.
964                  */
965                 if (!had_dst && (*dst)->error) {
966                         dst_release(*dst);
967                         *dst = NULL;
968                 }
969
970                 if (fl6->flowi6_oif)
971                         flags |= RT6_LOOKUP_F_IFACE;
972         }
973
974         if (!*dst)
975                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
976
977         err = (*dst)->error;
978         if (err)
979                 goto out_err_release;
980
981 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
982         /*
983          * Here if the dst entry we've looked up
984          * has a neighbour entry that is in the INCOMPLETE
985          * state and the src address from the flow is
986          * marked as OPTIMISTIC, we release the found
987          * dst entry and replace it instead with the
988          * dst entry of the nexthop router
989          */
990         rt = (struct rt6_info *) *dst;
991         rcu_read_lock_bh();
992         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
993                                       rt6_nexthop(rt, &fl6->daddr));
994         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
995         rcu_read_unlock_bh();
996
997         if (err) {
998                 struct inet6_ifaddr *ifp;
999                 struct flowi6 fl_gw6;
1000                 int redirect;
1001
1002                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1003                                       (*dst)->dev, 1);
1004
1005                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1006                 if (ifp)
1007                         in6_ifa_put(ifp);
1008
1009                 if (redirect) {
1010                         /*
1011                          * We need to get the dst entry for the
1012                          * default router instead
1013                          */
1014                         dst_release(*dst);
1015                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1016                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1017                         *dst = ip6_route_output(net, sk, &fl_gw6);
1018                         err = (*dst)->error;
1019                         if (err)
1020                                 goto out_err_release;
1021                 }
1022         }
1023 #endif
1024         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1025             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1026                 err = -EAFNOSUPPORT;
1027                 goto out_err_release;
1028         }
1029
1030         return 0;
1031
1032 out_err_release:
1033         dst_release(*dst);
1034         *dst = NULL;
1035
1036         if (err == -ENETUNREACH)
1037                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1038         return err;
1039 }
1040
1041 /**
1042  *      ip6_dst_lookup - perform route lookup on flow
1043  *      @sk: socket which provides route info
1044  *      @dst: pointer to dst_entry * for result
1045  *      @fl6: flow to lookup
1046  *
1047  *      This function performs a route lookup on the given flow.
1048  *
1049  *      It returns zero on success, or a standard errno code on error.
1050  */
1051 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1052                    struct flowi6 *fl6)
1053 {
1054         *dst = NULL;
1055         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1056 }
1057 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1058
1059 /**
1060  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1061  *      @sk: socket which provides route info
1062  *      @fl6: flow to lookup
1063  *      @final_dst: final destination address for ipsec lookup
1064  *
1065  *      This function performs a route lookup on the given flow.
1066  *
1067  *      It returns a valid dst pointer on success, or a pointer encoded
1068  *      error code.
1069  */
1070 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1071                                       const struct in6_addr *final_dst)
1072 {
1073         struct dst_entry *dst = NULL;
1074         int err;
1075
1076         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1077         if (err)
1078                 return ERR_PTR(err);
1079         if (final_dst)
1080                 fl6->daddr = *final_dst;
1081
1082         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1083 }
1084 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1085
1086 /**
1087  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1088  *      @sk: socket which provides the dst cache and route info
1089  *      @fl6: flow to lookup
1090  *      @final_dst: final destination address for ipsec lookup
1091  *
1092  *      This function performs a route lookup on the given flow with the
1093  *      possibility of using the cached route in the socket if it is valid.
1094  *      It will take the socket dst lock when operating on the dst cache.
1095  *      As a result, this function can only be used in process context.
1096  *
1097  *      It returns a valid dst pointer on success, or a pointer encoded
1098  *      error code.
1099  */
1100 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1101                                          const struct in6_addr *final_dst)
1102 {
1103         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1104
1105         dst = ip6_sk_dst_check(sk, dst, fl6);
1106         if (!dst)
1107                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1108
1109         return dst;
1110 }
1111 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1112
1113 static inline int ip6_ufo_append_data(struct sock *sk,
1114                         struct sk_buff_head *queue,
1115                         int getfrag(void *from, char *to, int offset, int len,
1116                         int odd, struct sk_buff *skb),
1117                         void *from, int length, int hh_len, int fragheaderlen,
1118                         int exthdrlen, int transhdrlen, int mtu,
1119                         unsigned int flags, const struct flowi6 *fl6)
1120
1121 {
1122         struct sk_buff *skb;
1123         int err;
1124
1125         /* There is support for UDP large send offload by network
1126          * device, so create one single skb packet containing complete
1127          * udp datagram
1128          */
1129         skb = skb_peek_tail(queue);
1130         if (!skb) {
1131                 skb = sock_alloc_send_skb(sk,
1132                         hh_len + fragheaderlen + transhdrlen + 20,
1133                         (flags & MSG_DONTWAIT), &err);
1134                 if (!skb)
1135                         return err;
1136
1137                 /* reserve space for Hardware header */
1138                 skb_reserve(skb, hh_len);
1139
1140                 /* create space for UDP/IP header */
1141                 skb_put(skb, fragheaderlen + transhdrlen);
1142
1143                 /* initialize network header pointer */
1144                 skb_set_network_header(skb, exthdrlen);
1145
1146                 /* initialize protocol header pointer */
1147                 skb->transport_header = skb->network_header + fragheaderlen;
1148
1149                 skb->protocol = htons(ETH_P_IPV6);
1150                 skb->csum = 0;
1151
1152                 __skb_queue_tail(queue, skb);
1153         } else if (skb_is_gso(skb)) {
1154                 goto append;
1155         }
1156
1157         skb->ip_summed = CHECKSUM_PARTIAL;
1158         /* Specify the length of each IPv6 datagram fragment.
1159          * It has to be a multiple of 8.
1160          */
1161         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1162                                      sizeof(struct frag_hdr)) & ~7;
1163         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1164         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1165                                                          &fl6->daddr,
1166                                                          &fl6->saddr);
1167
1168 append:
1169         return skb_append_datato_frags(sk, skb, getfrag, from,
1170                                        (length - transhdrlen));
1171 }
1172
1173 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1174                                                gfp_t gfp)
1175 {
1176         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1177 }
1178
1179 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1180                                                 gfp_t gfp)
1181 {
1182         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1183 }
1184
1185 static void ip6_append_data_mtu(unsigned int *mtu,
1186                                 int *maxfraglen,
1187                                 unsigned int fragheaderlen,
1188                                 struct sk_buff *skb,
1189                                 struct rt6_info *rt,
1190                                 unsigned int orig_mtu)
1191 {
1192         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1193                 if (!skb) {
1194                         /* first fragment, reserve header_len */
1195                         *mtu = orig_mtu - rt->dst.header_len;
1196
1197                 } else {
1198                         /*
1199                          * this fragment is not first, the headers
1200                          * space is regarded as data space.
1201                          */
1202                         *mtu = orig_mtu;
1203                 }
1204                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1205                               + fragheaderlen - sizeof(struct frag_hdr);
1206         }
1207 }
1208
1209 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1210                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1211                           struct rt6_info *rt, struct flowi6 *fl6)
1212 {
1213         struct ipv6_pinfo *np = inet6_sk(sk);
1214         unsigned int mtu;
1215         struct ipv6_txoptions *opt = ipc6->opt;
1216
1217         /*
1218          * setup for corking
1219          */
1220         if (opt) {
1221                 if (WARN_ON(v6_cork->opt))
1222                         return -EINVAL;
1223
1224                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1225                 if (unlikely(!v6_cork->opt))
1226                         return -ENOBUFS;
1227
1228                 v6_cork->opt->tot_len = opt->tot_len;
1229                 v6_cork->opt->opt_flen = opt->opt_flen;
1230                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1231
1232                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1233                                                     sk->sk_allocation);
1234                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1235                         return -ENOBUFS;
1236
1237                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1238                                                     sk->sk_allocation);
1239                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1240                         return -ENOBUFS;
1241
1242                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1243                                                    sk->sk_allocation);
1244                 if (opt->hopopt && !v6_cork->opt->hopopt)
1245                         return -ENOBUFS;
1246
1247                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1248                                                     sk->sk_allocation);
1249                 if (opt->srcrt && !v6_cork->opt->srcrt)
1250                         return -ENOBUFS;
1251
1252                 /* need source address above miyazawa*/
1253         }
1254         dst_hold(&rt->dst);
1255         cork->base.dst = &rt->dst;
1256         cork->fl.u.ip6 = *fl6;
1257         v6_cork->hop_limit = ipc6->hlimit;
1258         v6_cork->tclass = ipc6->tclass;
1259         if (rt->dst.flags & DST_XFRM_TUNNEL)
1260                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1261                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1262         else
1263                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1264                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1265         if (np->frag_size < mtu) {
1266                 if (np->frag_size)
1267                         mtu = np->frag_size;
1268         }
1269         cork->base.fragsize = mtu;
1270         if (dst_allfrag(rt->dst.path))
1271                 cork->base.flags |= IPCORK_ALLFRAG;
1272         cork->base.length = 0;
1273
1274         return 0;
1275 }
1276
1277 static int __ip6_append_data(struct sock *sk,
1278                              struct flowi6 *fl6,
1279                              struct sk_buff_head *queue,
1280                              struct inet_cork *cork,
1281                              struct inet6_cork *v6_cork,
1282                              struct page_frag *pfrag,
1283                              int getfrag(void *from, char *to, int offset,
1284                                          int len, int odd, struct sk_buff *skb),
1285                              void *from, int length, int transhdrlen,
1286                              unsigned int flags, struct ipcm6_cookie *ipc6,
1287                              const struct sockcm_cookie *sockc)
1288 {
1289         struct sk_buff *skb, *skb_prev = NULL;
1290         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1291         int exthdrlen = 0;
1292         int dst_exthdrlen = 0;
1293         int hh_len;
1294         int copy;
1295         int err;
1296         int offset = 0;
1297         __u8 tx_flags = 0;
1298         u32 tskey = 0;
1299         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1300         struct ipv6_txoptions *opt = v6_cork->opt;
1301         int csummode = CHECKSUM_NONE;
1302         unsigned int maxnonfragsize, headersize;
1303
1304         skb = skb_peek_tail(queue);
1305         if (!skb) {
1306                 exthdrlen = opt ? opt->opt_flen : 0;
1307                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1308         }
1309
1310         mtu = cork->fragsize;
1311         orig_mtu = mtu;
1312
1313         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1314
1315         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1316                         (opt ? opt->opt_nflen : 0);
1317         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1318                      sizeof(struct frag_hdr);
1319
1320         headersize = sizeof(struct ipv6hdr) +
1321                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1322                      (dst_allfrag(&rt->dst) ?
1323                       sizeof(struct frag_hdr) : 0) +
1324                      rt->rt6i_nfheader_len;
1325
1326         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1327             (sk->sk_protocol == IPPROTO_UDP ||
1328              sk->sk_protocol == IPPROTO_RAW)) {
1329                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1330                                 sizeof(struct ipv6hdr));
1331                 goto emsgsize;
1332         }
1333
1334         if (ip6_sk_ignore_df(sk))
1335                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1336         else
1337                 maxnonfragsize = mtu;
1338
1339         if (cork->length + length > maxnonfragsize - headersize) {
1340 emsgsize:
1341                 ipv6_local_error(sk, EMSGSIZE, fl6,
1342                                  mtu - headersize +
1343                                  sizeof(struct ipv6hdr));
1344                 return -EMSGSIZE;
1345         }
1346
1347         /* CHECKSUM_PARTIAL only with no extension headers and when
1348          * we are not going to fragment
1349          */
1350         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1351             headersize == sizeof(struct ipv6hdr) &&
1352             length <= mtu - headersize &&
1353             !(flags & MSG_MORE) &&
1354             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1355                 csummode = CHECKSUM_PARTIAL;
1356
1357         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1358                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1359                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1360                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1361                         tskey = sk->sk_tskey++;
1362         }
1363
1364         /*
1365          * Let's try using as much space as possible.
1366          * Use MTU if total length of the message fits into the MTU.
1367          * Otherwise, we need to reserve fragment header and
1368          * fragment alignment (= 8-15 octects, in total).
1369          *
1370          * Note that we may need to "move" the data from the tail of
1371          * of the buffer to the new fragment when we split
1372          * the message.
1373          *
1374          * FIXME: It may be fragmented into multiple chunks
1375          *        at once if non-fragmentable extension headers
1376          *        are too large.
1377          * --yoshfuji
1378          */
1379
1380         cork->length += length;
1381         if ((((length + fragheaderlen) > mtu) ||
1382              (skb && skb_is_gso(skb))) &&
1383             (sk->sk_protocol == IPPROTO_UDP) &&
1384             (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
1385             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1386                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1387                                           hh_len, fragheaderlen, exthdrlen,
1388                                           transhdrlen, mtu, flags, fl6);
1389                 if (err)
1390                         goto error;
1391                 return 0;
1392         }
1393
1394         if (!skb)
1395                 goto alloc_new_skb;
1396
1397         while (length > 0) {
1398                 /* Check if the remaining data fits into current packet. */
1399                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1400                 if (copy < length)
1401                         copy = maxfraglen - skb->len;
1402
1403                 if (copy <= 0) {
1404                         char *data;
1405                         unsigned int datalen;
1406                         unsigned int fraglen;
1407                         unsigned int fraggap;
1408                         unsigned int alloclen;
1409 alloc_new_skb:
1410                         /* There's no room in the current skb */
1411                         if (skb)
1412                                 fraggap = skb->len - maxfraglen;
1413                         else
1414                                 fraggap = 0;
1415                         /* update mtu and maxfraglen if necessary */
1416                         if (!skb || !skb_prev)
1417                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1418                                                     fragheaderlen, skb, rt,
1419                                                     orig_mtu);
1420
1421                         skb_prev = skb;
1422
1423                         /*
1424                          * If remaining data exceeds the mtu,
1425                          * we know we need more fragment(s).
1426                          */
1427                         datalen = length + fraggap;
1428
1429                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1430                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1431                         if ((flags & MSG_MORE) &&
1432                             !(rt->dst.dev->features&NETIF_F_SG))
1433                                 alloclen = mtu;
1434                         else
1435                                 alloclen = datalen + fragheaderlen;
1436
1437                         alloclen += dst_exthdrlen;
1438
1439                         if (datalen != length + fraggap) {
1440                                 /*
1441                                  * this is not the last fragment, the trailer
1442                                  * space is regarded as data space.
1443                                  */
1444                                 datalen += rt->dst.trailer_len;
1445                         }
1446
1447                         alloclen += rt->dst.trailer_len;
1448                         fraglen = datalen + fragheaderlen;
1449
1450                         /*
1451                          * We just reserve space for fragment header.
1452                          * Note: this may be overallocation if the message
1453                          * (without MSG_MORE) fits into the MTU.
1454                          */
1455                         alloclen += sizeof(struct frag_hdr);
1456
1457                         if (transhdrlen) {
1458                                 skb = sock_alloc_send_skb(sk,
1459                                                 alloclen + hh_len,
1460                                                 (flags & MSG_DONTWAIT), &err);
1461                         } else {
1462                                 skb = NULL;
1463                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1464                                     2 * sk->sk_sndbuf)
1465                                         skb = sock_wmalloc(sk,
1466                                                            alloclen + hh_len, 1,
1467                                                            sk->sk_allocation);
1468                                 if (unlikely(!skb))
1469                                         err = -ENOBUFS;
1470                         }
1471                         if (!skb)
1472                                 goto error;
1473                         /*
1474                          *      Fill in the control structures
1475                          */
1476                         skb->protocol = htons(ETH_P_IPV6);
1477                         skb->ip_summed = csummode;
1478                         skb->csum = 0;
1479                         /* reserve for fragmentation and ipsec header */
1480                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1481                                     dst_exthdrlen);
1482
1483                         /* Only the initial fragment is time stamped */
1484                         skb_shinfo(skb)->tx_flags = tx_flags;
1485                         tx_flags = 0;
1486                         skb_shinfo(skb)->tskey = tskey;
1487                         tskey = 0;
1488
1489                         /*
1490                          *      Find where to start putting bytes
1491                          */
1492                         data = skb_put(skb, fraglen);
1493                         skb_set_network_header(skb, exthdrlen);
1494                         data += fragheaderlen;
1495                         skb->transport_header = (skb->network_header +
1496                                                  fragheaderlen);
1497                         if (fraggap) {
1498                                 skb->csum = skb_copy_and_csum_bits(
1499                                         skb_prev, maxfraglen,
1500                                         data + transhdrlen, fraggap, 0);
1501                                 skb_prev->csum = csum_sub(skb_prev->csum,
1502                                                           skb->csum);
1503                                 data += fraggap;
1504                                 pskb_trim_unique(skb_prev, maxfraglen);
1505                         }
1506                         copy = datalen - transhdrlen - fraggap;
1507
1508                         if (copy < 0) {
1509                                 err = -EINVAL;
1510                                 kfree_skb(skb);
1511                                 goto error;
1512                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1513                                 err = -EFAULT;
1514                                 kfree_skb(skb);
1515                                 goto error;
1516                         }
1517
1518                         offset += copy;
1519                         length -= datalen - fraggap;
1520                         transhdrlen = 0;
1521                         exthdrlen = 0;
1522                         dst_exthdrlen = 0;
1523
1524                         /*
1525                          * Put the packet on the pending queue
1526                          */
1527                         __skb_queue_tail(queue, skb);
1528                         continue;
1529                 }
1530
1531                 if (copy > length)
1532                         copy = length;
1533
1534                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1535                         unsigned int off;
1536
1537                         off = skb->len;
1538                         if (getfrag(from, skb_put(skb, copy),
1539                                                 offset, copy, off, skb) < 0) {
1540                                 __skb_trim(skb, off);
1541                                 err = -EFAULT;
1542                                 goto error;
1543                         }
1544                 } else {
1545                         int i = skb_shinfo(skb)->nr_frags;
1546
1547                         err = -ENOMEM;
1548                         if (!sk_page_frag_refill(sk, pfrag))
1549                                 goto error;
1550
1551                         if (!skb_can_coalesce(skb, i, pfrag->page,
1552                                               pfrag->offset)) {
1553                                 err = -EMSGSIZE;
1554                                 if (i == MAX_SKB_FRAGS)
1555                                         goto error;
1556
1557                                 __skb_fill_page_desc(skb, i, pfrag->page,
1558                                                      pfrag->offset, 0);
1559                                 skb_shinfo(skb)->nr_frags = ++i;
1560                                 get_page(pfrag->page);
1561                         }
1562                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1563                         if (getfrag(from,
1564                                     page_address(pfrag->page) + pfrag->offset,
1565                                     offset, copy, skb->len, skb) < 0)
1566                                 goto error_efault;
1567
1568                         pfrag->offset += copy;
1569                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1570                         skb->len += copy;
1571                         skb->data_len += copy;
1572                         skb->truesize += copy;
1573                         atomic_add(copy, &sk->sk_wmem_alloc);
1574                 }
1575                 offset += copy;
1576                 length -= copy;
1577         }
1578
1579         return 0;
1580
1581 error_efault:
1582         err = -EFAULT;
1583 error:
1584         cork->length -= length;
1585         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1586         return err;
1587 }
1588
1589 int ip6_append_data(struct sock *sk,
1590                     int getfrag(void *from, char *to, int offset, int len,
1591                                 int odd, struct sk_buff *skb),
1592                     void *from, int length, int transhdrlen,
1593                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1594                     struct rt6_info *rt, unsigned int flags,
1595                     const struct sockcm_cookie *sockc)
1596 {
1597         struct inet_sock *inet = inet_sk(sk);
1598         struct ipv6_pinfo *np = inet6_sk(sk);
1599         int exthdrlen;
1600         int err;
1601
1602         if (flags&MSG_PROBE)
1603                 return 0;
1604         if (skb_queue_empty(&sk->sk_write_queue)) {
1605                 /*
1606                  * setup for corking
1607                  */
1608                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1609                                      ipc6, rt, fl6);
1610                 if (err)
1611                         return err;
1612
1613                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1614                 length += exthdrlen;
1615                 transhdrlen += exthdrlen;
1616         } else {
1617                 fl6 = &inet->cork.fl.u.ip6;
1618                 transhdrlen = 0;
1619         }
1620
1621         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1622                                  &np->cork, sk_page_frag(sk), getfrag,
1623                                  from, length, transhdrlen, flags, ipc6, sockc);
1624 }
1625 EXPORT_SYMBOL_GPL(ip6_append_data);
1626
1627 static void ip6_cork_release(struct inet_cork_full *cork,
1628                              struct inet6_cork *v6_cork)
1629 {
1630         if (v6_cork->opt) {
1631                 kfree(v6_cork->opt->dst0opt);
1632                 kfree(v6_cork->opt->dst1opt);
1633                 kfree(v6_cork->opt->hopopt);
1634                 kfree(v6_cork->opt->srcrt);
1635                 kfree(v6_cork->opt);
1636                 v6_cork->opt = NULL;
1637         }
1638
1639         if (cork->base.dst) {
1640                 dst_release(cork->base.dst);
1641                 cork->base.dst = NULL;
1642                 cork->base.flags &= ~IPCORK_ALLFRAG;
1643         }
1644         memset(&cork->fl, 0, sizeof(cork->fl));
1645 }
1646
1647 struct sk_buff *__ip6_make_skb(struct sock *sk,
1648                                struct sk_buff_head *queue,
1649                                struct inet_cork_full *cork,
1650                                struct inet6_cork *v6_cork)
1651 {
1652         struct sk_buff *skb, *tmp_skb;
1653         struct sk_buff **tail_skb;
1654         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1655         struct ipv6_pinfo *np = inet6_sk(sk);
1656         struct net *net = sock_net(sk);
1657         struct ipv6hdr *hdr;
1658         struct ipv6_txoptions *opt = v6_cork->opt;
1659         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1660         struct flowi6 *fl6 = &cork->fl.u.ip6;
1661         unsigned char proto = fl6->flowi6_proto;
1662
1663         skb = __skb_dequeue(queue);
1664         if (!skb)
1665                 goto out;
1666         tail_skb = &(skb_shinfo(skb)->frag_list);
1667
1668         /* move skb->data to ip header from ext header */
1669         if (skb->data < skb_network_header(skb))
1670                 __skb_pull(skb, skb_network_offset(skb));
1671         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1672                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1673                 *tail_skb = tmp_skb;
1674                 tail_skb = &(tmp_skb->next);
1675                 skb->len += tmp_skb->len;
1676                 skb->data_len += tmp_skb->len;
1677                 skb->truesize += tmp_skb->truesize;
1678                 tmp_skb->destructor = NULL;
1679                 tmp_skb->sk = NULL;
1680         }
1681
1682         /* Allow local fragmentation. */
1683         skb->ignore_df = ip6_sk_ignore_df(sk);
1684
1685         *final_dst = fl6->daddr;
1686         __skb_pull(skb, skb_network_header_len(skb));
1687         if (opt && opt->opt_flen)
1688                 ipv6_push_frag_opts(skb, opt, &proto);
1689         if (opt && opt->opt_nflen)
1690                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1691
1692         skb_push(skb, sizeof(struct ipv6hdr));
1693         skb_reset_network_header(skb);
1694         hdr = ipv6_hdr(skb);
1695
1696         ip6_flow_hdr(hdr, v6_cork->tclass,
1697                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1698                                         np->autoflowlabel, fl6));
1699         hdr->hop_limit = v6_cork->hop_limit;
1700         hdr->nexthdr = proto;
1701         hdr->saddr = fl6->saddr;
1702         hdr->daddr = *final_dst;
1703
1704         skb->priority = sk->sk_priority;
1705         skb->mark = sk->sk_mark;
1706
1707         skb_dst_set(skb, dst_clone(&rt->dst));
1708         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1709         if (proto == IPPROTO_ICMPV6) {
1710                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1711
1712                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1713                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1714         }
1715
1716         ip6_cork_release(cork, v6_cork);
1717 out:
1718         return skb;
1719 }
1720
1721 int ip6_send_skb(struct sk_buff *skb)
1722 {
1723         struct net *net = sock_net(skb->sk);
1724         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1725         int err;
1726
1727         err = ip6_local_out(net, skb->sk, skb);
1728         if (err) {
1729                 if (err > 0)
1730                         err = net_xmit_errno(err);
1731                 if (err)
1732                         IP6_INC_STATS(net, rt->rt6i_idev,
1733                                       IPSTATS_MIB_OUTDISCARDS);
1734         }
1735
1736         return err;
1737 }
1738
/*
 * ip6_push_pending_frames - build and transmit whatever is pending on @sk
 *
 * Returns 0 when ip6_finish_skb() produced no packet, otherwise the result
 * of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *pkt = ip6_finish_skb(sk);

        return pkt ? ip6_send_skb(pkt) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1750
1751 static void __ip6_flush_pending_frames(struct sock *sk,
1752                                        struct sk_buff_head *queue,
1753                                        struct inet_cork_full *cork,
1754                                        struct inet6_cork *v6_cork)
1755 {
1756         struct sk_buff *skb;
1757
1758         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1759                 if (skb_dst(skb))
1760                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1761                                       IPSTATS_MIB_OUTDISCARDS);
1762                 kfree_skb(skb);
1763         }
1764
1765         ip6_cork_release(cork, v6_cork);
1766 }
1767
1768 void ip6_flush_pending_frames(struct sock *sk)
1769 {
1770         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1771                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1772 }
1773 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1774
1775 struct sk_buff *ip6_make_skb(struct sock *sk,
1776                              int getfrag(void *from, char *to, int offset,
1777                                          int len, int odd, struct sk_buff *skb),
1778                              void *from, int length, int transhdrlen,
1779                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1780                              struct rt6_info *rt, unsigned int flags,
1781                              const struct sockcm_cookie *sockc)
1782 {
1783         struct inet_cork_full cork;
1784         struct inet6_cork v6_cork;
1785         struct sk_buff_head queue;
1786         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1787         int err;
1788
1789         if (flags & MSG_PROBE)
1790                 return NULL;
1791
1792         __skb_queue_head_init(&queue);
1793
1794         cork.base.flags = 0;
1795         cork.base.addr = 0;
1796         cork.base.opt = NULL;
1797         v6_cork.opt = NULL;
1798         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1799         if (err)
1800                 return ERR_PTR(err);
1801
1802         if (ipc6->dontfrag < 0)
1803                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1804
1805         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1806                                 &current->task_frag, getfrag, from,
1807                                 length + exthdrlen, transhdrlen + exthdrlen,
1808                                 flags, ipc6, sockc);
1809         if (err) {
1810                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1811                 return ERR_PTR(err);
1812         }
1813
1814         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1815 }