/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

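/*
 * Note on the split above: nf_hook() returns 1 when the LOCAL_OUT hook
 * accepted the packet without stealing or queueing it, in which case the
 * caller is responsible for invoking the okfn itself.  ip6_local_out()
 * therefore only calls dst_output() on that return value; anything else
 * (0 or a negative errno) means netfilter already consumed the skb.
 */
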
/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx(newskb);
	return 0;
}

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev)) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

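/*
 * With IPV6_PMTUDISC_PROBE the socket asks to ignore the discovered path
 * MTU, so ip6_skb_dst_mtu() reports the raw device MTU instead of
 * dst_mtu(); probe packets are then sized against the link MTU rather
 * than the (possibly smaller) cached path MTU.
 */
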
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

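/*
 * Fragmentation is needed above in two cases: the packet exceeds the
 * path MTU and is not a GSO super-packet (GSO frames are segmented later,
 * closer to the device), or the destination has dst_allfrag() set.  The
 * latter happens when a Packet Too Big message advertised an MTU below
 * the IPv6 minimum of 1280 bytes: per RFC 2460 we keep sending
 * minimum-MTU packets but must include a fragment header in every one.
 */
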
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

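	/*
	 * The first 32-bit word of the IPv6 header packs version (4 bits),
	 * traffic class (8 bits) and flow label (20 bits): 0x60000000 is
	 * version 6 in the top nibble, the traffic class is shifted into
	 * bits 20-27, and fl6_flowlabel is already in network byte order.
	 * With tclass == 0 and no label this is just the plain 0x60000000
	 * word.
	 */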
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

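/*
 * Delivery pattern above: every matching router-alert socket except the
 * last gets a clone of the skb, and the original is handed to the final
 * match.  That saves one skb_clone() in the common single-listener case
 * while still letting each listener see the packet.
 */
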
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reactions involving unicast neighbor discovery
			 * messages destined to the proxied address, pass them
			 * to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not perform any processing on RA packets,
	 *	pushing them to user level AS IS without any WARRANTY
	 *	that the application will be able to interpret them.
	 *	The reason is that we cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything. Defragmentation would
	 *	also be a mistake: RA packets cannot be fragmented,
	 *	because there is no warranty that different fragments
	 *	will go along one path. --ANK
	 */
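	/*
	 * opt->ra is the offset of the IPv6 Router Alert hop-by-hop option
	 * within the packet.  The option is laid out as type (1 byte),
	 * length (1 byte) and a 16-bit big-endian value, so ptr[2]/ptr[3]
	 * below reassemble that value for ip6_call_ra_chain().
	 */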
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

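/*
 * ip6_find_1stfragopt() implements the RFC 2460 rule that the fragment
 * header must follow the "unfragmentable" part of the packet: hop-by-hop,
 * routing, and any destination options header that precedes a routing
 * header (plus a HAO destination option for Mobile IPv6).  Everything
 * from the returned offset onward is fragmented.
 */
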
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (!skb->local_df) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

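/*
 * Sizing example for the code above (illustrative numbers): with a
 * 1500-byte MTU and a bare 40-byte IPv6 header (hlen == 40), the
 * "mtu -= hlen + sizeof(struct frag_hdr)" step leaves 1452 bytes of
 * fragmentable payload; the slow path then rounds each non-final
 * fragment down to a multiple of 8, i.e. 1448 payload bytes plus the
 * 48 bytes of IPv6 and fragment headers on the wire.
 */
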
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the non-connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS, or
	 * MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)

{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP large send offload,
	 * so create one single skb containing the complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

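/*
 * The gso_size computed above is rounded down to a multiple of 8 because
 * the fragment header's frag_off field counts 8-octet units; when the
 * device (or the software GSO fallback) later splits the super-packet,
 * every fragment offset must stay representable.
 */
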
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

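/*
 * Both helpers above rely on the extension-header length encoding of
 * RFC 2460: hdrlen counts 8-octet units beyond the first, so the full
 * size of the header to duplicate is (hdrlen + 1) * 8 bytes.
 */
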
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
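	/*
	 * maxfraglen is the largest packet length (IPv6 header included)
	 * that still leaves room for a fragment header and keeps the
	 * fragmentable part a multiple of 8.  Example with mtu = 1500 and
	 * fragheaderlen = 40: ((1500 - 40) & ~7) + 40 - 8 = 1488, and
	 * 1488 + 8 == 1496 <= 1500 once the fragment header is added.
	 */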

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

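/*
 * Typical corked-send usage of the two functions above (a sketch of how
 * datagram protocols such as UDPv6 drive them; error handling is elided
 * and the "corked" flag is illustrative, not a real variable here):
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len, transhdrlen,
 *			      hlimit, tclass, opt, fl, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 */
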
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}