ipv6: make fragment identifications less predictable, CVE-2011-2699
net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

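/*
 * __ip6_local_out() finalizes a locally generated packet: it computes the
 * IPv6 payload length (forced to zero when it would exceed IPV6_MAXPLEN,
 * i.e. the jumbogram case) and runs the NF_INET_LOCAL_OUT netfilter hook.
 * ip6_local_out() then hands the packet to dst_output() when the hook
 * returns 1 (accepted without being stolen or queued).
 */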
int __ip6_local_out(struct sk_buff *skb)
{
        int len;

        len = skb->len - sizeof(struct ipv6hdr);
        if (len > IPV6_MAXPLEN)
                len = 0;
        ipv6_hdr(skb)->payload_len = htons(len);

        return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip6_local_out(skb);
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(newskb));

        netif_rx_ni(newskb);
        return 0;
}

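/*
 * ip6_finish_output2() is the final neighbour-resolution step.  For
 * multicast destinations it may first loop a clone of the packet back to
 * local listeners (and to the multicast router socket), then it transmits
 * via the cached hardware header (dst->hh) or the neighbour's output
 * function.
 */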
static int ip6_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
                    ((mroute6_socket(dev_net(dev), skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(dev_net(dev), idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
                                skb->len);
        }

        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(dev_net(dst->dev),
                         ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

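/*
 * Fragmentation decision point: packets larger than the path MTU that are
 * not GSO, or destinations that require fragmentation on every packet
 * (dst_allfrag), go through ip6_fragment(); everything else is sent
 * directly.
 */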
static int ip6_finish_output(struct sk_buff *skb)
{
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)))
                return ip6_fragment(skb, ip6_finish_output2);
        else
                return ip6_finish_output2(skb);
}

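/*
 * ip6_output() is the dst output method for IPv6.  Packets are dropped
 * early when IPv6 is administratively disabled on the device; otherwise
 * the NF_INET_POST_ROUTING hook runs, unless the packet was already
 * rerouted by netfilter (IP6SKB_REROUTED).
 */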
int ip6_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(dev_net(dev), idev,
                              IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt)
{
        struct net *net = sock_net(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit = -1;
        int tclass = 0;
        u32 mtu;
        if (opt) {
                unsigned int head_room;

                /* First things first: extension headers may take a lot of
                   space (~8K for now); MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (skb2 == NULL) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        kfree_skb(skb);
                        skb = skb2;
                        skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np) {
                tclass = np->tclass;
                hlimit = np->hop_limit;
        }
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                               dst->dev, dst_output);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *      To avoid extra problems, ND packets are sent through this
 *      routine. It's code duplication, but I really want to avoid
 *      extra checks, since ipv6_build_header is used by TCP (which
 *      is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               const struct in6_addr *saddr, const struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        skb_reset_network_header(skb);
        skb_put(skb, sizeof(struct ipv6hdr));
        hdr = ipv6_hdr(skb);

        *(__be32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* Unicast neighbour discovery messages destined
                         * to the proxied address are passed to the input
                         * function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

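/*
 * ip6_forward() validates a transit packet (forwarding enabled, no LRO,
 * XFRM policy, hop limit > 1), honours the Router Alert option, handles
 * proxied neighbour discovery, emits redirects or "packet too big" errors
 * where required, and finally decrements the hop limit (only after
 * skb_cow(), so a private copy of the header is mangled) before the
 * NF_INET_FORWARD hook.
 */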
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        skb_forward_csum(skb);

        /*
         *      We do NOT do any processing on Router Alert packets,
         *      pushing them to user level AS IS, without any warranty
         *      that the application will be able to interpret them.
         *      The reason is that we cannot do anything clever here.
         *
         *      We are not the end node, so if the packet contains
         *      AH/ESP we cannot do anything with it either.
         *      Defragmenting would also be a mistake: RA packets
         *      must not be fragmented, because there is no guarantee
         *      that different fragments will follow one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb_network_header(skb) + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement the hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force the output device to be used as the
                 * source-address device */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(dst),
                                      IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects for source-routed frames.
           We also don't send redirects for frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
            !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same:
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = dst_mtu(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (skb->len > mtu && !skb_is_gso(skb)) {
                /* Again, force the output device to be used as the
                 * source-address device */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Decrementing the hop limit is delayed until after skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
        skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr =
                                (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
        unsigned int packet_len = skb->tail - skb->network_header;
        int found_rhdr = 0;
        *nexthdr = &ipv6_hdr(skb)->nexthdr;

        while (offset + 1 <= packet_len) {

                switch (**nexthdr) {

                case NEXTHDR_HOP:
                        break;
                case NEXTHDR_ROUTING:
                        found_rhdr = 1;
                        break;
                case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
                        if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
                                break;
#endif
                        if (found_rhdr)
                                return offset;
                        break;
                default:
                        return offset;
                }

                offset += ipv6_optlen(exthdr);
                *nexthdr = &exthdr->nexthdr;
                exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
                                                 offset);
        }

        return offset;
}

static u32 hashidentrnd __read_mostly;
#define FID_HASH_SZ 16
static u32 ipv6_fragmentation_id[FID_HASH_SZ];

void __init initialize_hashidentrnd(void)
{
        get_random_bytes(&hashidentrnd, sizeof(hashidentrnd));
}

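/*
 * Fragment ID selection (CVE-2011-2699): a single, globally sequential
 * fragment counter lets an off-path attacker predict the IDs a host will
 * use and forge matching fragments.  Instead, hash the destination
 * address with a boot-time random secret (jhash2 over the four 32-bit
 * words of the in6_addr), pick one of FID_HASH_SZ per-bucket counters,
 * and advance it locklessly with cmpxchg().  The returned ID is the
 * counter value offset by the hash, so IDs for different destinations
 * live in unrelated, secret ranges.  The (hash + newid) == 0 case is
 * skipped so a zero identification is never produced; ip6_fragment()
 * uses frag_id == 0 to mean "not yet chosen".
 */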
static u32 __ipv6_select_ident(const struct in6_addr *addr)
{
        u32 newid, oldid, hash = jhash2((u32 *)addr, 4, hashidentrnd);
        u32 *pid = &ipv6_fragmentation_id[hash % FID_HASH_SZ];

        do {
                oldid = *pid;
                newid = oldid + 1;
                if (!(hash + newid))
                        newid++;
        } while (cmpxchg(pid, oldid, newid) != oldid);

        return hash + newid;
}

void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
        fhdr->identification = htonl(__ipv6_select_ident(&rt->rt6i_dst.addr));
}

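/*
 * ip6_fragment() implements two strategies: a fast path that reuses an
 * existing frag_list when every piece already has the right geometry
 * (fits the MTU, is a multiple of 8 bytes except the last, and has
 * headroom for the headers), and a slow path that allocates and copies
 * each fragment.  Both stamp the same identification, chosen by
 * ipv6_select_ident(), into every fragment header.
 */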
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        __be32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;
        struct net *net = dev_net(skb_dst(skb)->dev);

        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb was not generated by a local socket.
         */
        if (!skb->local_df && skb->len > mtu) {
                skb->dev = skb_dst(skb)->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        mtu -= hlen + sizeof(struct frag_hdr);

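        /* Fast path: the skb already carries a chain of appropriately
         * sized fragments (e.g. built by ip6_append_data).  Verify the
         * geometry; any mismatch, shared clone, or missing headroom
         * sends us to the copying slow path below.
         */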
        if (skb_has_frag_list(skb)) {
                int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Check for correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                ipv6_select_ident(fh, rt);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        dst_release(&rt->dst);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                dst_release(&rt->dst);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

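        /* Slow path: allocate a fresh skb per fragment, copy the
         * unfragmentable header and up to 'mtu' bytes of payload into
         * each one, and emit them in order.  'frag_id' is chosen on the
         * first fragment and reused for the rest.
         */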
slow_path:
        left = skb->len - hlen;         /* Data that remains to be sent */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(fh, rt);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        kfree_skb(skb);
        return err;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(struct rt6key *rt_key,
                               struct in6_addr *fl_addr,
                               struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          struct flowi *fl)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt = (struct rt6_info *)dst;

        if (!dst)
                goto out;

        /* Yes, checking route validity in the not connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using a saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
            (fl->oif && fl->oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
                               struct dst_entry **dst, struct flowi *fl)
{
        int err;
        struct net *net = sock_net(sk);

        if (*dst == NULL)
                *dst = ip6_route_output(net, sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
                                         &fl->fl6_dst,
                                         sk ? inet6_sk(sk)->srcprefs : 0,
                                         &fl->fl6_src);
                if (err)
                        goto out_err_release;
        }

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
                struct inet6_ifaddr *ifp;
                struct flowi fl_gw;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw, fl, sizeof(struct flowi));
                        memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw);
                        if ((err = (*dst)->error))
                                goto out_err_release;
                }
        }
#endif

        return 0;

out_err_release:
        if (err == -ENETUNREACH)
                IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        dst_release(*dst);
        *dst = NULL;
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @dst: pointer to dst_entry * for result
 *      @fl: flow to lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        *dst = NULL;
        if (sk) {
                *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
                *dst = ip6_sk_dst_check(sk, *dst, fl);
        }

        return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

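/*
 * UFO transmit helper: rather than fragmenting in software, queue one
 * large skb and let the device (or the GSO layer) segment it.  The
 * fragment identification shared by all resulting fragments is chosen
 * here via ipv6_select_ident() and stashed in skb_shinfo(skb)->ip6_frag_id.
 */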
static inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags,
                        struct rt6_info *rt)

{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP large send offload, so create
         * one single skb containing the complete UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return -ENOMEM;

                /* reserve space for the hardware header */
                skb_reserve(skb, hh_len);

                /* create space for the UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize the network header pointer */
                skb_reset_network_header(skb);

                /* initialize the protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                                      (length - transhdrlen));
        if (!err) {
                struct frag_hdr fhdr;

                /* Specify the length of each IPv6 datagram fragment.
                 * It has to be a multiple of 8.
                 */
                skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                             sizeof(struct frag_hdr)) & ~7;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                ipv6_select_ident(&fhdr, rt);
                skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UDP LSO,
         * so follow the normal path.
         */
        kfree_skb(skb);

        return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
        int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
        struct rt6_info *rt, unsigned int flags, int dontfrag)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags & MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (WARN_ON(np->cork.opt))
                                return -EINVAL;

                        np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
                        if (unlikely(np->cork.opt == NULL))
                                return -ENOBUFS;

                        np->cork.opt->tot_len = opt->tot_len;
                        np->cork.opt->opt_flen = opt->opt_flen;
                        np->cork.opt->opt_nflen = opt->opt_nflen;

                        np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                            sk->sk_allocation);
                        if (opt->dst0opt && !np->cork.opt->dst0opt)
                                return -ENOBUFS;

                        np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                            sk->sk_allocation);
                        if (opt->dst1opt && !np->cork.opt->dst1opt)
                                return -ENOBUFS;

                        np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                           sk->sk_allocation);
                        if (opt->hopopt && !np->cork.opt->hopopt)
                                return -ENOBUFS;

                        np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                            sk->sk_allocation);
                        if (opt->srcrt && !np->cork.opt->srcrt)
                                return -ENOBUFS;

                        /* need source address above miyazawa */
                }
                dst_hold(&rt->dst);
                inet->cork.dst = &rt->dst;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
                if (np->frag_size < mtu) {
                        if (np->frag_size)
                                mtu = np->frag_size;
                }
                inet->cork.fragsize = mtu;
                if (dst_allfrag(rt->dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
                            rt->rt6i_nfheader_len;
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = (struct rt6_info *)inet->cork.dst;
                fl = &inet->cork.fl;
                opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

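        /*
         * maxfraglen is the largest packet length (IPv6 header included)
         * whose payload can still be extended by a multiple-of-8 chunk.
         * Worked example, assuming a 1500-byte MTU and no extension
         * headers (fragheaderlen = 40): ((1500 - 40) & ~7) + 40 - 8
         * = 1456 + 40 - 8 = 1488, i.e. 1448 bytes of payload per
         * non-final fragment once the 8-byte fragment header is added.
         */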
        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        inet->cork.length += length;
        if (length > mtu) {
                int proto = sk->sk_protocol;
                if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
                        ipv6_local_rxpmtu(sk, fl, mtu - exthdrlen);
                        return -EMSGSIZE;
                }

                if (proto == IPPROTO_UDP &&
                    (rt->dst.dev->features & NETIF_F_UFO)) {

                        err = ip6_ufo_append_data(sk, getfrag, from, length,
                                                  hh_len, fragheaderlen,
                                                  transhdrlen, mtu, flags, rt);
                        if (err)
                                goto error;
                        return 0;
                }
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into the current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features & NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->dst.trailer_len;

1319                         /*
1320                          * We just reserve space for fragment header.
1321                          * Note: this may be overallocation if the message
1322                          * (without MSG_MORE) fits into the MTU.
1323                          */
1324                         alloclen += sizeof(struct frag_hdr);
1325
1326                         if (transhdrlen) {
1327                                 skb = sock_alloc_send_skb(sk,
1328                                                 alloclen + hh_len,
1329                                                 (flags & MSG_DONTWAIT), &err);
1330                         } else {
1331                                 skb = NULL;
1332                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1333                                     2 * sk->sk_sndbuf)
1334                                         skb = sock_wmalloc(sk,
1335                                                            alloclen + hh_len, 1,
1336                                                            sk->sk_allocation);
1337                                 if (unlikely(skb == NULL))
1338                                         err = -ENOBUFS;
1339                         }
1340                         if (skb == NULL)
1341                                 goto error;
1342                         /*
1343                          *      Fill in the control structures
1344                          */
1345                         skb->ip_summed = csummode;
1346                         skb->csum = 0;
1347                         /* reserve for fragmentation */
1348                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1349
1350                         /*
1351                          *      Find where to start putting bytes
1352                          */
1353                         data = skb_put(skb, fraglen);
1354                         skb_set_network_header(skb, exthdrlen);
1355                         data += fragheaderlen;
1356                         skb->transport_header = (skb->network_header +
1357                                                  fragheaderlen);
1358                         if (fraggap) {
1359                                 skb->csum = skb_copy_and_csum_bits(
1360                                         skb_prev, maxfraglen,
1361                                         data + transhdrlen, fraggap, 0);
1362                                 skb_prev->csum = csum_sub(skb_prev->csum,
1363                                                           skb->csum);
1364                                 data += fraggap;
1365                                 pskb_trim_unique(skb_prev, maxfraglen);
1366                         }
1367                         copy = datalen - transhdrlen - fraggap;
1368                         if (copy < 0) {
1369                                 err = -EINVAL;
1370                                 kfree_skb(skb);
1371                                 goto error;
1372                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1373                                 err = -EFAULT;
1374                                 kfree_skb(skb);
1375                                 goto error;
1376                         }
1377
1378                         offset += copy;
1379                         length -= datalen - fraggap;
1380                         transhdrlen = 0;
1381                         exthdrlen = 0;
1382                         csummode = CHECKSUM_NONE;
1383
1384                         /*
1385                          * Put the packet on the pending queue
1386                          */
1387                         __skb_queue_tail(&sk->sk_write_queue, skb);
1388                         continue;
1389                 }
1390
1391                 if (copy > length)
1392                         copy = length;
1393
1394                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1395                         unsigned int off;
1396
1397                         off = skb->len;
1398                         if (getfrag(from, skb_put(skb, copy),
1399                                                 offset, copy, off, skb) < 0) {
1400                                 __skb_trim(skb, off);
1401                                 err = -EFAULT;
1402                                 goto error;
1403                         }
1404                 } else {
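                        /*
                         * Scatter-gather path: reuse the partially filled
                         * page cached on the socket while it has room,
                         * otherwise allocate a fresh page.
                         */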
1405                         int i = skb_shinfo(skb)->nr_frags;
1406                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1407                         struct page *page = sk->sk_sndmsg_page;
1408                         int off = sk->sk_sndmsg_off;
1409                         unsigned int left;
1410
1411                         if (page && (left = PAGE_SIZE - off) > 0) {
1412                                 if (copy >= left)
1413                                         copy = left;
1414                                 if (page != frag->page) {
1415                                         if (i == MAX_SKB_FRAGS) {
1416                                                 err = -EMSGSIZE;
1417                                                 goto error;
1418                                         }
1419                                         get_page(page);
1420                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1421                                         frag = &skb_shinfo(skb)->frags[i];
1422                                 }
1423                         } else if (i < MAX_SKB_FRAGS) {
1424                                 if (copy > PAGE_SIZE)
1425                                         copy = PAGE_SIZE;
1426                                 page = alloc_pages(sk->sk_allocation, 0);
1427                                 if (page == NULL) {
1428                                         err = -ENOMEM;
1429                                         goto error;
1430                                 }
1431                                 sk->sk_sndmsg_page = page;
1432                                 sk->sk_sndmsg_off = 0;
1433
1434                                 skb_fill_page_desc(skb, i, page, 0, 0);
1435                                 frag = &skb_shinfo(skb)->frags[i];
1436                         } else {
1437                                 err = -EMSGSIZE;
1438                                 goto error;
1439                         }
1440                         if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
1441                                 err = -EFAULT;
1442                                 goto error;
1443                         }
1444                         sk->sk_sndmsg_off += copy;
1445                         frag->size += copy;
1446                         skb->len += copy;
1447                         skb->data_len += copy;
1448                         skb->truesize += copy;
1449                         atomic_add(copy, &sk->sk_wmem_alloc);
1450                 }
1451                 offset += copy;
1452                 length -= copy;
1453         }
1454         return 0;
1455 error:
1456         inet->cork.length -= length;
1457         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1458         return err;
1459 }
1460
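/*
 * Drop per-cork state: the copied extension headers, the cached route
 * and the saved flow description.
 */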
1461 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1462 {
1463         if (np->cork.opt) {
1464                 kfree(np->cork.opt->dst0opt);
1465                 kfree(np->cork.opt->dst1opt);
1466                 kfree(np->cork.opt->hopopt);
1467                 kfree(np->cork.opt->srcrt);
1468                 kfree(np->cork.opt);
1469                 np->cork.opt = NULL;
1470         }
1471
1472         if (inet->cork.dst) {
1473                 dst_release(inet->cork.dst);
1474                 inet->cork.dst = NULL;
1475                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1476         }
1477         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1478 }
1479
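/*
 * Merge everything queued by ip6_append_data() into one packet (the
 * tail skbs are chained on the head's frag_list), prepend the extension
 * and IPv6 headers and hand the result to ip6_local_out().
 */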
1480 int ip6_push_pending_frames(struct sock *sk)
1481 {
1482         struct sk_buff *skb, *tmp_skb;
1483         struct sk_buff **tail_skb;
1484         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1485         struct inet_sock *inet = inet_sk(sk);
1486         struct ipv6_pinfo *np = inet6_sk(sk);
1487         struct net *net = sock_net(sk);
1488         struct ipv6hdr *hdr;
1489         struct ipv6_txoptions *opt = np->cork.opt;
1490         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1491         struct flowi *fl = &inet->cork.fl;
1492         unsigned char proto = fl->proto;
1493         int err = 0;
1494
1495         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1496                 goto out;
1497         tail_skb = &(skb_shinfo(skb)->frag_list);
1498
1499         /* move skb->data up from the extension header area to the IP header */
1500         if (skb->data < skb_network_header(skb))
1501                 __skb_pull(skb, skb_network_offset(skb));
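        /*
         * Chain the remaining queued skbs onto the head skb's frag_list,
         * stripping their duplicate network header room and folding their
         * sizes into the head.
         */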
1502         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1503                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1504                 *tail_skb = tmp_skb;
1505                 tail_skb = &(tmp_skb->next);
1506                 skb->len += tmp_skb->len;
1507                 skb->data_len += tmp_skb->len;
1508                 skb->truesize += tmp_skb->truesize;
1509                 tmp_skb->destructor = NULL;
1510                 tmp_skb->sk = NULL;
1511         }
1512
1513         /* Allow local fragmentation. */
1514         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1515                 skb->local_df = 1;
1516
1517         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1518         __skb_pull(skb, skb_network_header_len(skb));
1519         if (opt && opt->opt_flen)
1520                 ipv6_push_frag_opts(skb, opt, &proto);
1521         if (opt && opt->opt_nflen)
1522                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1523
1524         skb_push(skb, sizeof(struct ipv6hdr));
1525         skb_reset_network_header(skb);
1526         hdr = ipv6_hdr(skb);
1527
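        /*
         * First word of the IPv6 header: version 6, the traffic class
         * saved at cork time and the flow label from the cached flow.
         */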
1528         *(__be32 *)hdr = fl->fl6_flowlabel |
1529                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1530
1531         hdr->hop_limit = np->cork.hop_limit;
1532         hdr->nexthdr = proto;
1533         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1534         ipv6_addr_copy(&hdr->daddr, final_dst);
1535
1536         skb->priority = sk->sk_priority;
1537         skb->mark = sk->sk_mark;
1538
1539         skb_dst_set(skb, dst_clone(&rt->dst));
1540         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1541         if (proto == IPPROTO_ICMPV6) {
1542                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1543
1544                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1545                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1546         }
1547
1548         err = ip6_local_out(skb);
1549         if (err) {
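                /* Positive values are NET_XMIT_* codes; translate them
                 * into errno values before reporting. */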
1550                 if (err > 0)
1551                         err = net_xmit_errno(err);
1552                 if (err)
1553                         goto error;
1554         }
1555
1556 out:
1557         ip6_cork_release(inet, np);
1558         return err;
1559 error:
1560         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1561         goto out;
1562 }
1563
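/*
 * Abort a corked send: discard everything still sitting on the write
 * queue and release the cork state.
 */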
1564 void ip6_flush_pending_frames(struct sock *sk)
1565 {
1566         struct sk_buff *skb;
1567
1568         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1569                 if (skb_dst(skb))
1570                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1571                                       IPSTATS_MIB_OUTDISCARDS);
1572                 kfree_skb(skb);
1573         }
1574
1575         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1576 }