ipv6 netns: Make several "global" sysctl variables namespace aware.
[linux-flexiantxendom0-natty.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
43
44 #include <net/sock.h>
45 #include <net/snmp.h>
46
47 #include <net/ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
53 #include <net/icmp.h>
54 #include <net/xfrm.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
57
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
59
60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
61 {
62         static u32 ipv6_fragmentation_id = 1;
63         static DEFINE_SPINLOCK(ip6_id_lock);
64
65         spin_lock_bh(&ip6_id_lock);
66         fhdr->identification = htonl(ipv6_fragmentation_id);
67         if (++ipv6_fragmentation_id == 0)
68                 ipv6_fragmentation_id = 1;
69         spin_unlock_bh(&ip6_id_lock);
70 }
71
72 int __ip6_local_out(struct sk_buff *skb)
73 {
74         int len;
75
76         len = skb->len - sizeof(struct ipv6hdr);
77         if (len > IPV6_MAXPLEN)
78                 len = 0;
79         ipv6_hdr(skb)->payload_len = htons(len);
80
81         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
82                        dst_output);
83 }
84
/*
 *      Send a locally generated packet: run __ip6_local_out() and, when it
 *      returns 1, continue with dst_output().  Any other result from
 *      __ip6_local_out() is returned to the caller unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
        int rc = __ip6_local_out(skb);

        if (unlikely(rc != 1))
                return rc;

        return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
96
97 static int ip6_output_finish(struct sk_buff *skb)
98 {
99         struct dst_entry *dst = skb->dst;
100
101         if (dst->hh)
102                 return neigh_hh_output(dst->hh, skb);
103         else if (dst->neighbour)
104                 return dst->neighbour->output(skb);
105
106         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
107         kfree_skb(skb);
108         return -EINVAL;
109
110 }
111
112 /* dev_loopback_xmit for use with netfilter. */
113 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
114 {
115         skb_reset_mac_header(newskb);
116         __skb_pull(newskb, skb_network_offset(newskb));
117         newskb->pkt_type = PACKET_LOOPBACK;
118         newskb->ip_summed = CHECKSUM_UNNECESSARY;
119         BUG_TRAP(newskb->dst);
120
121         netif_rx(newskb);
122         return 0;
123 }
124
125
126 static int ip6_output2(struct sk_buff *skb)
127 {
128         struct dst_entry *dst = skb->dst;
129         struct net_device *dev = dst->dev;
130
131         skb->protocol = htons(ETH_P_IPV6);
132         skb->dev = dev;
133
134         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
135                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
136                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
137
138                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
139                     ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
140                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
141                                          &ipv6_hdr(skb)->saddr))) {
142                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
143
144                         /* Do not check for IFF_ALLMULTI; multicast routing
145                            is not supported in any case.
146                          */
147                         if (newskb)
148                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
149                                         NULL, newskb->dev,
150                                         ip6_dev_loopback_xmit);
151
152                         if (ipv6_hdr(skb)->hop_limit == 0) {
153                                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
154                                 kfree_skb(skb);
155                                 return 0;
156                         }
157                 }
158
159                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
160         }
161
162         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
163                        ip6_output_finish);
164 }
165
166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
167 {
168         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
169
170         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
171                skb->dst->dev->mtu : dst_mtu(skb->dst);
172 }
173
174 int ip6_output(struct sk_buff *skb)
175 {
176         struct inet6_dev *idev = ip6_dst_idev(skb->dst);
177         if (unlikely(idev->cnf.disable_ipv6)) {
178                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
179                 kfree_skb(skb);
180                 return 0;
181         }
182
183         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
184                                 dst_allfrag(skb->dst))
185                 return ip6_fragment(skb, ip6_output2);
186         else
187                 return ip6_output2(skb);
188 }
189
190 /*
191  *      xmit an sk_buff (used by TCP)
192  */
193
194 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
195              struct ipv6_txoptions *opt, int ipfragok)
196 {
197         struct ipv6_pinfo *np = inet6_sk(sk);
198         struct in6_addr *first_hop = &fl->fl6_dst;
199         struct dst_entry *dst = skb->dst;
200         struct ipv6hdr *hdr;
201         u8  proto = fl->proto;
202         int seg_len = skb->len;
203         int hlimit, tclass;
204         u32 mtu;
205
206         if (opt) {
207                 unsigned int head_room;
208
209                 /* First: exthdrs may take lots of space (~8K for now)
210                    MAX_HEADER is not enough.
211                  */
212                 head_room = opt->opt_nflen + opt->opt_flen;
213                 seg_len += head_room;
214                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
215
216                 if (skb_headroom(skb) < head_room) {
217                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
218                         if (skb2 == NULL) {
219                                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
220                                               IPSTATS_MIB_OUTDISCARDS);
221                                 kfree_skb(skb);
222                                 return -ENOBUFS;
223                         }
224                         kfree_skb(skb);
225                         skb = skb2;
226                         if (sk)
227                                 skb_set_owner_w(skb, sk);
228                 }
229                 if (opt->opt_flen)
230                         ipv6_push_frag_opts(skb, opt, &proto);
231                 if (opt->opt_nflen)
232                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
233         }
234
235         skb_push(skb, sizeof(struct ipv6hdr));
236         skb_reset_network_header(skb);
237         hdr = ipv6_hdr(skb);
238
239         /*
240          *      Fill in the IPv6 header
241          */
242
243         hlimit = -1;
244         if (np)
245                 hlimit = np->hop_limit;
246         if (hlimit < 0)
247                 hlimit = ip6_dst_hoplimit(dst);
248
249         tclass = -1;
250         if (np)
251                 tclass = np->tclass;
252         if (tclass < 0)
253                 tclass = 0;
254
255         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
256
257         hdr->payload_len = htons(seg_len);
258         hdr->nexthdr = proto;
259         hdr->hop_limit = hlimit;
260
261         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
262         ipv6_addr_copy(&hdr->daddr, first_hop);
263
264         skb->priority = sk->sk_priority;
265         skb->mark = sk->sk_mark;
266
267         mtu = dst_mtu(dst);
268         if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
269                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
270                               IPSTATS_MIB_OUTREQUESTS);
271                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
272                                 dst_output);
273         }
274
275         if (net_ratelimit())
276                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
277         skb->dev = dst->dev;
278         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
279         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
280         kfree_skb(skb);
281         return -EMSGSIZE;
282 }
283
284 EXPORT_SYMBOL(ip6_xmit);
285
286 /*
287  *      To avoid extra problems ND packets are send through this
288  *      routine. It's code duplication but I really want to avoid
289  *      extra checks since ipv6_build_header is used by TCP (which
290  *      is for us performance critical)
291  */
292
293 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
294                const struct in6_addr *saddr, const struct in6_addr *daddr,
295                int proto, int len)
296 {
297         struct ipv6_pinfo *np = inet6_sk(sk);
298         struct ipv6hdr *hdr;
299         int totlen;
300
301         skb->protocol = htons(ETH_P_IPV6);
302         skb->dev = dev;
303
304         totlen = len + sizeof(struct ipv6hdr);
305
306         skb_reset_network_header(skb);
307         skb_put(skb, sizeof(struct ipv6hdr));
308         hdr = ipv6_hdr(skb);
309
310         *(__be32*)hdr = htonl(0x60000000);
311
312         hdr->payload_len = htons(len);
313         hdr->nexthdr = proto;
314         hdr->hop_limit = np->hop_limit;
315
316         ipv6_addr_copy(&hdr->saddr, saddr);
317         ipv6_addr_copy(&hdr->daddr, daddr);
318
319         return 0;
320 }
321
322 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
323 {
324         struct ip6_ra_chain *ra;
325         struct sock *last = NULL;
326
327         read_lock(&ip6_ra_lock);
328         for (ra = ip6_ra_chain; ra; ra = ra->next) {
329                 struct sock *sk = ra->sk;
330                 if (sk && ra->sel == sel &&
331                     (!sk->sk_bound_dev_if ||
332                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
333                         if (last) {
334                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
335                                 if (skb2)
336                                         rawv6_rcv(last, skb2);
337                         }
338                         last = sk;
339                 }
340         }
341
342         if (last) {
343                 rawv6_rcv(last, skb);
344                 read_unlock(&ip6_ra_lock);
345                 return 1;
346         }
347         read_unlock(&ip6_ra_lock);
348         return 0;
349 }
350
351 static int ip6_forward_proxy_check(struct sk_buff *skb)
352 {
353         struct ipv6hdr *hdr = ipv6_hdr(skb);
354         u8 nexthdr = hdr->nexthdr;
355         int offset;
356
357         if (ipv6_ext_hdr(nexthdr)) {
358                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
359                 if (offset < 0)
360                         return 0;
361         } else
362                 offset = sizeof(struct ipv6hdr);
363
364         if (nexthdr == IPPROTO_ICMPV6) {
365                 struct icmp6hdr *icmp6;
366
367                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
368                                          offset + 1 - skb->data)))
369                         return 0;
370
371                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
372
373                 switch (icmp6->icmp6_type) {
374                 case NDISC_ROUTER_SOLICITATION:
375                 case NDISC_ROUTER_ADVERTISEMENT:
376                 case NDISC_NEIGHBOUR_SOLICITATION:
377                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
378                 case NDISC_REDIRECT:
379                         /* For reaction involving unicast neighbor discovery
380                          * message destined to the proxied address, pass it to
381                          * input function.
382                          */
383                         return 1;
384                 default:
385                         break;
386                 }
387         }
388
389         /*
390          * The proxying router can't forward traffic sent to a link-local
391          * address, so signal the sender and discard the packet. This
392          * behavior is clarified by the MIPv6 specification.
393          */
394         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
395                 dst_link_failure(skb);
396                 return -1;
397         }
398
399         return 0;
400 }
401
/*
 *      Continuation of ip6_forward() after the FORWARD netfilter hook:
 *      pass the packet to the output routine of its route.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
        int rc = dst_output(skb);

        return rc;
}
406
407 int ip6_forward(struct sk_buff *skb)
408 {
409         struct dst_entry *dst = skb->dst;
410         struct ipv6hdr *hdr = ipv6_hdr(skb);
411         struct inet6_skb_parm *opt = IP6CB(skb);
412         struct net *net = dev_net(dst->dev);
413
414         if (net->ipv6.devconf_all->forwarding == 0)
415                 goto error;
416
417         if (skb_warn_if_lro(skb))
418                 goto drop;
419
420         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
421                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
422                 goto drop;
423         }
424
425         skb_forward_csum(skb);
426
427         /*
428          *      We DO NOT make any processing on
429          *      RA packets, pushing them to user level AS IS
430          *      without ane WARRANTY that application will be able
431          *      to interpret them. The reason is that we
432          *      cannot make anything clever here.
433          *
434          *      We are not end-node, so that if packet contains
435          *      AH/ESP, we cannot make anything.
436          *      Defragmentation also would be mistake, RA packets
437          *      cannot be fragmented, because there is no warranty
438          *      that different fragments will go along one path. --ANK
439          */
440         if (opt->ra) {
441                 u8 *ptr = skb_network_header(skb) + opt->ra;
442                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
443                         return 0;
444         }
445
446         /*
447          *      check and decrement ttl
448          */
449         if (hdr->hop_limit <= 1) {
450                 /* Force OUTPUT device used as source address */
451                 skb->dev = dst->dev;
452                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
453                             0, skb->dev);
454                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
455
456                 kfree_skb(skb);
457                 return -ETIMEDOUT;
458         }
459
460         /* XXX: idev->cnf.proxy_ndp? */
461         if (net->ipv6.devconf_all->proxy_ndp &&
462             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
463                 int proxied = ip6_forward_proxy_check(skb);
464                 if (proxied > 0)
465                         return ip6_input(skb);
466                 else if (proxied < 0) {
467                         IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
468                         goto drop;
469                 }
470         }
471
472         if (!xfrm6_route_forward(skb)) {
473                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
474                 goto drop;
475         }
476         dst = skb->dst;
477
478         /* IPv6 specs say nothing about it, but it is clear that we cannot
479            send redirects to source routed frames.
480            We don't send redirects to frames decapsulated from IPsec.
481          */
482         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
483             !skb->sp) {
484                 struct in6_addr *target = NULL;
485                 struct rt6_info *rt;
486                 struct neighbour *n = dst->neighbour;
487
488                 /*
489                  *      incoming and outgoing devices are the same
490                  *      send a redirect.
491                  */
492
493                 rt = (struct rt6_info *) dst;
494                 if ((rt->rt6i_flags & RTF_GATEWAY))
495                         target = (struct in6_addr*)&n->primary_key;
496                 else
497                         target = &hdr->daddr;
498
499                 /* Limit redirects both by destination (here)
500                    and by source (inside ndisc_send_redirect)
501                  */
502                 if (xrlim_allow(dst, 1*HZ))
503                         ndisc_send_redirect(skb, n, target);
504         } else {
505                 int addrtype = ipv6_addr_type(&hdr->saddr);
506
507                 /* This check is security critical. */
508                 if (addrtype == IPV6_ADDR_ANY ||
509                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
510                         goto error;
511                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
512                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
513                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
514                         goto error;
515                 }
516         }
517
518         if (skb->len > dst_mtu(dst)) {
519                 /* Again, force OUTPUT device used as source address */
520                 skb->dev = dst->dev;
521                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
522                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
523                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
524                 kfree_skb(skb);
525                 return -EMSGSIZE;
526         }
527
528         if (skb_cow(skb, dst->dev->hard_header_len)) {
529                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
530                 goto drop;
531         }
532
533         hdr = ipv6_hdr(skb);
534
535         /* Mangling hops number delayed to point after skb COW */
536
537         hdr->hop_limit--;
538
539         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
540         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
541                        ip6_forward_finish);
542
543 error:
544         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
545 drop:
546         kfree_skb(skb);
547         return -EINVAL;
548 }
549
550 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
551 {
552         to->pkt_type = from->pkt_type;
553         to->priority = from->priority;
554         to->protocol = from->protocol;
555         dst_release(to->dst);
556         to->dst = dst_clone(from->dst);
557         to->dev = from->dev;
558         to->mark = from->mark;
559
560 #ifdef CONFIG_NET_SCHED
561         to->tc_index = from->tc_index;
562 #endif
563         nf_copy(to, from);
564 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
565     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
566         to->nf_trace = from->nf_trace;
567 #endif
568         skb_copy_secmark(to, from);
569 }
570
571 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
572 {
573         u16 offset = sizeof(struct ipv6hdr);
574         struct ipv6_opt_hdr *exthdr =
575                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
576         unsigned int packet_len = skb->tail - skb->network_header;
577         int found_rhdr = 0;
578         *nexthdr = &ipv6_hdr(skb)->nexthdr;
579
580         while (offset + 1 <= packet_len) {
581
582                 switch (**nexthdr) {
583
584                 case NEXTHDR_HOP:
585                         break;
586                 case NEXTHDR_ROUTING:
587                         found_rhdr = 1;
588                         break;
589                 case NEXTHDR_DEST:
590 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
591                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
592                                 break;
593 #endif
594                         if (found_rhdr)
595                                 return offset;
596                         break;
597                 default :
598                         return offset;
599                 }
600
601                 offset += ipv6_optlen(exthdr);
602                 *nexthdr = &exthdr->nexthdr;
603                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
604                                                  offset);
605         }
606
607         return offset;
608 }
609
610 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
611 {
612         struct net_device *dev;
613         struct sk_buff *frag;
614         struct rt6_info *rt = (struct rt6_info*)skb->dst;
615         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
616         struct ipv6hdr *tmp_hdr;
617         struct frag_hdr *fh;
618         unsigned int mtu, hlen, left, len;
619         __be32 frag_id = 0;
620         int ptr, offset = 0, err=0;
621         u8 *prevhdr, nexthdr = 0;
622
623         dev = rt->u.dst.dev;
624         hlen = ip6_find_1stfragopt(skb, &prevhdr);
625         nexthdr = *prevhdr;
626
627         mtu = ip6_skb_dst_mtu(skb);
628
629         /* We must not fragment if the socket is set to force MTU discovery
630          * or if the skb it not generated by a local socket.  (This last
631          * check should be redundant, but it's free.)
632          */
633         if (!skb->local_df) {
634                 skb->dev = skb->dst->dev;
635                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
636                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
637                 kfree_skb(skb);
638                 return -EMSGSIZE;
639         }
640
641         if (np && np->frag_size < mtu) {
642                 if (np->frag_size)
643                         mtu = np->frag_size;
644         }
645         mtu -= hlen + sizeof(struct frag_hdr);
646
647         if (skb_shinfo(skb)->frag_list) {
648                 int first_len = skb_pagelen(skb);
649                 int truesizes = 0;
650
651                 if (first_len - hlen > mtu ||
652                     ((first_len - hlen) & 7) ||
653                     skb_cloned(skb))
654                         goto slow_path;
655
656                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
657                         /* Correct geometry. */
658                         if (frag->len > mtu ||
659                             ((frag->len & 7) && frag->next) ||
660                             skb_headroom(frag) < hlen)
661                             goto slow_path;
662
663                         /* Partially cloned skb? */
664                         if (skb_shared(frag))
665                                 goto slow_path;
666
667                         BUG_ON(frag->sk);
668                         if (skb->sk) {
669                                 sock_hold(skb->sk);
670                                 frag->sk = skb->sk;
671                                 frag->destructor = sock_wfree;
672                                 truesizes += frag->truesize;
673                         }
674                 }
675
676                 err = 0;
677                 offset = 0;
678                 frag = skb_shinfo(skb)->frag_list;
679                 skb_shinfo(skb)->frag_list = NULL;
680                 /* BUILD HEADER */
681
682                 *prevhdr = NEXTHDR_FRAGMENT;
683                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
684                 if (!tmp_hdr) {
685                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
686                         return -ENOMEM;
687                 }
688
689                 __skb_pull(skb, hlen);
690                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
691                 __skb_push(skb, hlen);
692                 skb_reset_network_header(skb);
693                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
694
695                 ipv6_select_ident(skb, fh);
696                 fh->nexthdr = nexthdr;
697                 fh->reserved = 0;
698                 fh->frag_off = htons(IP6_MF);
699                 frag_id = fh->identification;
700
701                 first_len = skb_pagelen(skb);
702                 skb->data_len = first_len - skb_headlen(skb);
703                 skb->truesize -= truesizes;
704                 skb->len = first_len;
705                 ipv6_hdr(skb)->payload_len = htons(first_len -
706                                                    sizeof(struct ipv6hdr));
707
708                 dst_hold(&rt->u.dst);
709
710                 for (;;) {
711                         /* Prepare header of the next frame,
712                          * before previous one went down. */
713                         if (frag) {
714                                 frag->ip_summed = CHECKSUM_NONE;
715                                 skb_reset_transport_header(frag);
716                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
717                                 __skb_push(frag, hlen);
718                                 skb_reset_network_header(frag);
719                                 memcpy(skb_network_header(frag), tmp_hdr,
720                                        hlen);
721                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
722                                 fh->nexthdr = nexthdr;
723                                 fh->reserved = 0;
724                                 fh->frag_off = htons(offset);
725                                 if (frag->next != NULL)
726                                         fh->frag_off |= htons(IP6_MF);
727                                 fh->identification = frag_id;
728                                 ipv6_hdr(frag)->payload_len =
729                                                 htons(frag->len -
730                                                       sizeof(struct ipv6hdr));
731                                 ip6_copy_metadata(frag, skb);
732                         }
733
734                         err = output(skb);
735                         if(!err)
736                                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
737
738                         if (err || !frag)
739                                 break;
740
741                         skb = frag;
742                         frag = skb->next;
743                         skb->next = NULL;
744                 }
745
746                 kfree(tmp_hdr);
747
748                 if (err == 0) {
749                         IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
750                         dst_release(&rt->u.dst);
751                         return 0;
752                 }
753
754                 while (frag) {
755                         skb = frag->next;
756                         kfree_skb(frag);
757                         frag = skb;
758                 }
759
760                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
761                 dst_release(&rt->u.dst);
762                 return err;
763         }
764
765 slow_path:
766         left = skb->len - hlen;         /* Space per frame */
767         ptr = hlen;                     /* Where to start from */
768
769         /*
770          *      Fragment the datagram.
771          */
772
773         *prevhdr = NEXTHDR_FRAGMENT;
774
775         /*
776          *      Keep copying data until we run out.
777          */
778         while(left > 0) {
779                 len = left;
780                 /* IF: it doesn't fit, use 'mtu' - the data space left */
781                 if (len > mtu)
782                         len = mtu;
783                 /* IF: we are not sending upto and including the packet end
784                    then align the next start on an eight byte boundary */
785                 if (len < left) {
786                         len &= ~7;
787                 }
788                 /*
789                  *      Allocate buffer.
790                  */
791
792                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
793                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
794                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
795                                       IPSTATS_MIB_FRAGFAILS);
796                         err = -ENOMEM;
797                         goto fail;
798                 }
799
800                 /*
801                  *      Set up data on packet
802                  */
803
804                 ip6_copy_metadata(frag, skb);
805                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
806                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
807                 skb_reset_network_header(frag);
808                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
809                 frag->transport_header = (frag->network_header + hlen +
810                                           sizeof(struct frag_hdr));
811
812                 /*
813                  *      Charge the memory for the fragment to any owner
814                  *      it might possess
815                  */
816                 if (skb->sk)
817                         skb_set_owner_w(frag, skb->sk);
818
819                 /*
820                  *      Copy the packet header into the new buffer.
821                  */
822                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
823
824                 /*
825                  *      Build fragment header.
826                  */
827                 fh->nexthdr = nexthdr;
828                 fh->reserved = 0;
829                 if (!frag_id) {
830                         ipv6_select_ident(skb, fh);
831                         frag_id = fh->identification;
832                 } else
833                         fh->identification = frag_id;
834
835                 /*
836                  *      Copy a block of the IP datagram.
837                  */
838                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
839                         BUG();
840                 left -= len;
841
842                 fh->frag_off = htons(offset);
843                 if (left > 0)
844                         fh->frag_off |= htons(IP6_MF);
845                 ipv6_hdr(frag)->payload_len = htons(frag->len -
846                                                     sizeof(struct ipv6hdr));
847
848                 ptr += len;
849                 offset += len;
850
851                 /*
852                  *      Put this fragment into the sending queue.
853                  */
854                 err = output(frag);
855                 if (err)
856                         goto fail;
857
858                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
859         }
860         IP6_INC_STATS(ip6_dst_idev(skb->dst),
861                       IPSTATS_MIB_FRAGOKS);
862         kfree_skb(skb);
863         return err;
864
865 fail:
866         IP6_INC_STATS(ip6_dst_idev(skb->dst),
867                       IPSTATS_MIB_FRAGFAILS);
868         kfree_skb(skb);
869         return err;
870 }
871
872 static inline int ip6_rt_check(struct rt6key *rt_key,
873                                struct in6_addr *fl_addr,
874                                struct in6_addr *addr_cache)
875 {
876         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
877                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
878 }
879
880 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
881                                           struct dst_entry *dst,
882                                           struct flowi *fl)
883 {
884         struct ipv6_pinfo *np = inet6_sk(sk);
885         struct rt6_info *rt = (struct rt6_info *)dst;
886
887         if (!dst)
888                 goto out;
889
890         /* Yes, checking route validity in not connected
891          * case is not very simple. Take into account,
892          * that we do not support routing by source, TOS,
893          * and MSG_DONTROUTE            --ANK (980726)
894          *
895          * 1. ip6_rt_check(): If route was host route,
896          *    check that cached destination is current.
897          *    If it is network route, we still may
898          *    check its validity using saved pointer
899          *    to the last used address: daddr_cache.
900          *    We do not want to save whole address now,
901          *    (because main consumer of this service
902          *    is tcp, which has not this problem),
903          *    so that the last trick works only on connected
904          *    sockets.
905          * 2. oif also should be the same.
906          */
907         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
908 #ifdef CONFIG_IPV6_SUBTREES
909             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
910 #endif
911             (fl->oif && fl->oif != dst->dev->ifindex)) {
912                 dst_release(dst);
913                 dst = NULL;
914         }
915
916 out:
917         return dst;
918 }
919
920 static int ip6_dst_lookup_tail(struct sock *sk,
921                                struct dst_entry **dst, struct flowi *fl)
922 {
923         int err;
924         struct net *net = sock_net(sk);
925
926         if (*dst == NULL)
927                 *dst = ip6_route_output(net, sk, fl);
928
929         if ((err = (*dst)->error))
930                 goto out_err_release;
931
932         if (ipv6_addr_any(&fl->fl6_src)) {
933                 err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
934                                          &fl->fl6_dst,
935                                          sk ? inet6_sk(sk)->srcprefs : 0,
936                                          &fl->fl6_src);
937                 if (err)
938                         goto out_err_release;
939         }
940
941 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
942                 /*
943                  * Here if the dst entry we've looked up
944                  * has a neighbour entry that is in the INCOMPLETE
945                  * state and the src address from the flow is
946                  * marked as OPTIMISTIC, we release the found
947                  * dst entry and replace it instead with the
948                  * dst entry of the nexthop router
949                  */
950                 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
951                         struct inet6_ifaddr *ifp;
952                         struct flowi fl_gw;
953                         int redirect;
954
955                         ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
956                                               (*dst)->dev, 1);
957
958                         redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
959                         if (ifp)
960                                 in6_ifa_put(ifp);
961
962                         if (redirect) {
963                                 /*
964                                  * We need to get the dst entry for the
965                                  * default router instead
966                                  */
967                                 dst_release(*dst);
968                                 memcpy(&fl_gw, fl, sizeof(struct flowi));
969                                 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
970                                 *dst = ip6_route_output(net, sk, &fl_gw);
971                                 if ((err = (*dst)->error))
972                                         goto out_err_release;
973                         }
974                 }
975 #endif
976
977         return 0;
978
979 out_err_release:
980         if (err == -ENETUNREACH)
981                 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
982         dst_release(*dst);
983         *dst = NULL;
984         return err;
985 }
986
987 /**
988  *      ip6_dst_lookup - perform route lookup on flow
989  *      @sk: socket which provides route info
990  *      @dst: pointer to dst_entry * for result
991  *      @fl: flow to lookup
992  *
993  *      This function performs a route lookup on the given flow.
994  *
995  *      It returns zero on success, or a standard errno code on error.
996  */
997 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
998 {
999         *dst = NULL;
1000         return ip6_dst_lookup_tail(sk, dst, fl);
1001 }
1002 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1003
1004 /**
1005  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1006  *      @sk: socket which provides the dst cache and route info
1007  *      @dst: pointer to dst_entry * for result
1008  *      @fl: flow to lookup
1009  *
1010  *      This function performs a route lookup on the given flow with the
1011  *      possibility of using the cached route in the socket if it is valid.
1012  *      It will take the socket dst lock when operating on the dst cache.
1013  *      As a result, this function can only be used in process context.
1014  *
1015  *      It returns zero on success, or a standard errno code on error.
1016  */
1017 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1018 {
1019         *dst = NULL;
1020         if (sk) {
1021                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1022                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1023         }
1024
1025         return ip6_dst_lookup_tail(sk, dst, fl);
1026 }
1027 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1028
1029 static inline int ip6_ufo_append_data(struct sock *sk,
1030                         int getfrag(void *from, char *to, int offset, int len,
1031                         int odd, struct sk_buff *skb),
1032                         void *from, int length, int hh_len, int fragheaderlen,
1033                         int transhdrlen, int mtu,unsigned int flags)
1034
1035 {
1036         struct sk_buff *skb;
1037         int err;
1038
1039         /* There is support for UDP large send offload by network
1040          * device, so create one single skb packet containing complete
1041          * udp datagram
1042          */
1043         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1044                 skb = sock_alloc_send_skb(sk,
1045                         hh_len + fragheaderlen + transhdrlen + 20,
1046                         (flags & MSG_DONTWAIT), &err);
1047                 if (skb == NULL)
1048                         return -ENOMEM;
1049
1050                 /* reserve space for Hardware header */
1051                 skb_reserve(skb, hh_len);
1052
1053                 /* create space for UDP/IP header */
1054                 skb_put(skb,fragheaderlen + transhdrlen);
1055
1056                 /* initialize network header pointer */
1057                 skb_reset_network_header(skb);
1058
1059                 /* initialize protocol header pointer */
1060                 skb->transport_header = skb->network_header + fragheaderlen;
1061
1062                 skb->ip_summed = CHECKSUM_PARTIAL;
1063                 skb->csum = 0;
1064                 sk->sk_sndmsg_off = 0;
1065         }
1066
1067         err = skb_append_datato_frags(sk,skb, getfrag, from,
1068                                       (length - transhdrlen));
1069         if (!err) {
1070                 struct frag_hdr fhdr;
1071
1072                 /* specify the length of each IP datagram fragment*/
1073                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1074                                             sizeof(struct frag_hdr);
1075                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1076                 ipv6_select_ident(skb, &fhdr);
1077                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1078                 __skb_queue_tail(&sk->sk_write_queue, skb);
1079
1080                 return 0;
1081         }
1082         /* There is not enough support do UPD LSO,
1083          * so follow normal path
1084          */
1085         kfree_skb(skb);
1086
1087         return err;
1088 }
1089
/**
 *      ip6_append_data - add data to the socket's pending (corked) write queue
 *      @sk: socket whose sk_write_queue and cork state are used
 *      @getfrag: callback that copies @len bytes at @offset from @from to
 *                @to; a negative return is reported as -EFAULT
 *      @from: opaque source cursor handed to @getfrag
 *      @length: bytes to queue; @transhdrlen is counted as part of it
 *      @transhdrlen: transport header length, used for the first skb only
 *      @hlimit: hop limit, saved in the cork state when corking starts
 *      @tclass: traffic class, saved in the cork state when corking starts
 *      @opt: tx options, copied into np->cork.opt when corking starts
 *      @fl: flow, copied into inet->cork.fl when corking starts
 *      @rt: route; a reference is taken when corking starts
 *      @flags: send flags; MSG_PROBE, MSG_MORE and MSG_DONTWAIT are examined
 *
 *      When the write queue is not empty, @rt, @fl and @transhdrlen are
 *      replaced by the values saved when corking started; @opt is replaced
 *      only if IPCORK_OPT is set.
 *
 *      Returns 0 on success (and immediately for MSG_PROBE).  Errors seen
 *      here are -ENOBUFS, -EINVAL, -EMSGSIZE, -EFAULT, -ENOMEM and whatever
 *      ip6_ufo_append_data() or sock_alloc_send_skb() report.
 */
1090 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1091         int offset, int len, int odd, struct sk_buff *skb),
1092         void *from, int length, int transhdrlen,
1093         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1094         struct rt6_info *rt, unsigned int flags)
1095 {
1096         struct inet_sock *inet = inet_sk(sk);
1097         struct ipv6_pinfo *np = inet6_sk(sk);
1098         struct sk_buff *skb;
1099         unsigned int maxfraglen, fragheaderlen;
1100         int exthdrlen;
1101         int hh_len;
1102         int mtu;
1103         int copy;
1104         int err;
1105         int offset = 0;
1106         int csummode = CHECKSUM_NONE;
1107
             /* MSG_PROBE: nothing is queued, success is reported at once. */
1108         if (flags&MSG_PROBE)
1109                 return 0;
1110         if (skb_queue_empty(&sk->sk_write_queue)) {
1111                 /*
1112                  * setup for corking
1113                  */
1114                 if (opt) {
1115                         if (np->cork.opt == NULL) {
1116                                 np->cork.opt = kmalloc(opt->tot_len,
1117                                                        sk->sk_allocation);
1118                                 if (unlikely(np->cork.opt == NULL))
1119                                         return -ENOBUFS;
1120                         } else if (np->cork.opt->tot_len < opt->tot_len) {
1121                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1122                                 return -EINVAL;
1123                         }
1124                         memcpy(np->cork.opt, opt, opt->tot_len);
1125                         inet->cork.flags |= IPCORK_OPT;
1126                         /* need source address above miyazawa*/
1127                 }
1128                 dst_hold(&rt->u.dst);
1129                 inet->cork.dst = &rt->u.dst;
1130                 inet->cork.fl = *fl;
1131                 np->cork.hop_limit = hlimit;
1132                 np->cork.tclass = tclass;
                     /*
                      * IPV6_PMTUDISC_PROBE takes the device MTU, anything else
                      * takes dst_mtu() of the route's path entry.
                      */
1133                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1134                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
                     /* A non-zero np->frag_size below the MTU overrides it. */
1135                 if (np->frag_size < mtu) {
1136                         if (np->frag_size)
1137                                 mtu = np->frag_size;
1138                 }
1139                 inet->cork.fragsize = mtu;
1140                 if (dst_allfrag(rt->u.dst.path))
1141                         inet->cork.flags |= IPCORK_ALLFRAG;
1142                 inet->cork.length = 0;
1143                 sk->sk_sndmsg_page = NULL;
1144                 sk->sk_sndmsg_off = 0;
1145                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1146                             rt->rt6i_nfheader_len;
1147                 length += exthdrlen;
1148                 transhdrlen += exthdrlen;
1149         } else {
                     /*
                      * Queue not empty: continue with the route, flow, options
                      * and MTU saved when corking started.
                      */
1150                 rt = (struct rt6_info *)inet->cork.dst;
1151                 fl = &inet->cork.fl;
1152                 if (inet->cork.flags & IPCORK_OPT)
1153                         opt = np->cork.opt;
1154                 transhdrlen = 0;
1155                 exthdrlen = 0;
1156                 mtu = inet->cork.fragsize;
1157         }
1158
1159         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1160
1161         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1162                         (opt ? opt->opt_nflen : 0);
             /*
              * Space after the headers rounded down to a multiple of 8,
              * plus the headers, minus room for a fragment header.
              */
1163         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1164
1165         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1166                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1167                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1168                         return -EMSGSIZE;
1169                 }
1170         }
1171
1172         /*
1173          * Let's try using as much space as possible.
1174          * Use MTU if total length of the message fits into the MTU.
1175          * Otherwise, we need to reserve fragment header and
1176          * fragment alignment (= 8-15 octets, in total).
1177          *
1178          * Note that we may need to "move" the data from the tail of
1179          * the buffer to the new fragment when we split
1180          * the message.
1181          *
1182          * FIXME: It may be fragmented into multiple chunks
1183          *        at once if non-fragmentable extension headers
1184          *        are too large.
1185          * --yoshfuji
1186          */
1187
1188         inet->cork.length += length;
             /*
              * A UDP send larger than the MTU on a device advertising
              * NETIF_F_UFO is handed to ip6_ufo_append_data() instead.
              */
1189         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1190             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1191
1192                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1193                                           fragheaderlen, transhdrlen, mtu,
1194                                           flags);
1195                 if (err)
1196                         goto error;
1197                 return 0;
1198         }
1199
1200         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1201                 goto alloc_new_skb;
1202
1203         while (length > 0) {
1204                 /* Check if the remaining data fits into current packet. */
1205                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1206                 if (copy < length)
1207                         copy = maxfraglen - skb->len;
1208
1209                 if (copy <= 0) {
1210                         char *data;
1211                         unsigned int datalen;
1212                         unsigned int fraglen;
1213                         unsigned int fraggap;
1214                         unsigned int alloclen;
1215                         struct sk_buff *skb_prev;
1216 alloc_new_skb:
1217                         skb_prev = skb;
1218
1219                         /* There's no room in the current skb */
                             /* fraggap: bytes of the previous skb beyond maxfraglen. */
1220                         if (skb_prev)
1221                                 fraggap = skb_prev->len - maxfraglen;
1222                         else
1223                                 fraggap = 0;
1224
1225                         /*
1226                          * If remaining data exceeds the mtu,
1227                          * we know we need more fragment(s).
1228                          */
1229                         datalen = length + fraggap;
1230                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1231                                 datalen = maxfraglen - fragheaderlen;
1232
1233                         fraglen = datalen + fragheaderlen;
1234                         if ((flags & MSG_MORE) &&
1235                             !(rt->u.dst.dev->features&NETIF_F_SG))
1236                                 alloclen = mtu;
1237                         else
1238                                 alloclen = datalen + fragheaderlen;
1239
1240                         /*
1241                          * The last fragment gets additional space at tail.
1242                          * Note: we overallocate on fragments with MSG_MORE
1243                          * because we have no idea if we're the last one.
1244                          */
1245                         if (datalen == length + fraggap)
1246                                 alloclen += rt->u.dst.trailer_len;
1247
1248                         /*
1249                          * We just reserve space for fragment header.
1250                          * Note: this may be overallocation if the message
1251                          * (without MSG_MORE) fits into the MTU.
1252                          */
1253                         alloclen += sizeof(struct frag_hdr);
1254
                             /*
                              * With a transport header (first skb) use
                              * sock_alloc_send_skb(); otherwise use sock_wmalloc(),
                              * and only while sk_wmem_alloc is at most twice
                              * sk_sndbuf.
                              */
1255                         if (transhdrlen) {
1256                                 skb = sock_alloc_send_skb(sk,
1257                                                 alloclen + hh_len,
1258                                                 (flags & MSG_DONTWAIT), &err);
1259                         } else {
1260                                 skb = NULL;
1261                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1262                                     2 * sk->sk_sndbuf)
1263                                         skb = sock_wmalloc(sk,
1264                                                            alloclen + hh_len, 1,
1265                                                            sk->sk_allocation);
1266                                 if (unlikely(skb == NULL))
1267                                         err = -ENOBUFS;
1268                         }
1269                         if (skb == NULL)
1270                                 goto error;
1271                         /*
1272                          *      Fill in the control structures
1273                          */
1274                         skb->ip_summed = csummode;
1275                         skb->csum = 0;
1276                         /* reserve for fragmentation */
1277                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1278
1279                         /*
1280                          *      Find where to start putting bytes
1281                          */
1282                         data = skb_put(skb, fraglen);
1283                         skb_set_network_header(skb, exthdrlen);
1284                         data += fragheaderlen;
1285                         skb->transport_header = (skb->network_header +
1286                                                  fragheaderlen);
                             /*
                              * Move the overhanging tail of the previous skb into
                              * the new one, adjust both checksums and trim the
                              * previous skb back to maxfraglen.
                              */
1287                         if (fraggap) {
1288                                 skb->csum = skb_copy_and_csum_bits(
1289                                         skb_prev, maxfraglen,
1290                                         data + transhdrlen, fraggap, 0);
1291                                 skb_prev->csum = csum_sub(skb_prev->csum,
1292                                                           skb->csum);
1293                                 data += fraggap;
1294                                 pskb_trim_unique(skb_prev, maxfraglen);
1295                         }
1296                         copy = datalen - transhdrlen - fraggap;
1297                         if (copy < 0) {
1298                                 err = -EINVAL;
1299                                 kfree_skb(skb);
1300                                 goto error;
1301                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1302                                 err = -EFAULT;
1303                                 kfree_skb(skb);
1304                                 goto error;
1305                         }
1306
1307                         offset += copy;
1308                         length -= datalen - fraggap;
                             /* Header lengths apply to the first skb only. */
1309                         transhdrlen = 0;
1310                         exthdrlen = 0;
1311                         csummode = CHECKSUM_NONE;
1312
1313                         /*
1314                          * Put the packet on the pending queue
1315                          */
1316                         __skb_queue_tail(&sk->sk_write_queue, skb);
1317                         continue;
1318                 }
1319
1320                 if (copy > length)
1321                         copy = length;
1322
                     /*
                      * Without NETIF_F_SG the data is appended to the skb's
                      * linear area; with it, the data goes into page fragments.
                      */
1323                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1324                         unsigned int off;
1325
1326                         off = skb->len;
1327                         if (getfrag(from, skb_put(skb, copy),
1328                                                 offset, copy, off, skb) < 0) {
1329                                 __skb_trim(skb, off);
1330                                 err = -EFAULT;
1331                                 goto error;
1332                         }
1333                 } else {
1334                         int i = skb_shinfo(skb)->nr_frags;
1335                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1336                         struct page *page = sk->sk_sndmsg_page;
1337                         int off = sk->sk_sndmsg_off;
1338                         unsigned int left;
1339
                             /*
                              * Reuse the page remembered in sk->sk_sndmsg_page
                              * while it has room, else allocate a new page.  Both
                              * ways fail with -EMSGSIZE when a new fragment slot
                              * is needed and MAX_SKB_FRAGS are already in use.
                              */
1340                         if (page && (left = PAGE_SIZE - off) > 0) {
1341                                 if (copy >= left)
1342                                         copy = left;
1343                                 if (page != frag->page) {
1344                                         if (i == MAX_SKB_FRAGS) {
1345                                                 err = -EMSGSIZE;
1346                                                 goto error;
1347                                         }
1348                                         get_page(page);
1349                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1350                                         frag = &skb_shinfo(skb)->frags[i];
1351                                 }
1352                         } else if(i < MAX_SKB_FRAGS) {
1353                                 if (copy > PAGE_SIZE)
1354                                         copy = PAGE_SIZE;
1355                                 page = alloc_pages(sk->sk_allocation, 0);
1356                                 if (page == NULL) {
1357                                         err = -ENOMEM;
1358                                         goto error;
1359                                 }
1360                                 sk->sk_sndmsg_page = page;
1361                                 sk->sk_sndmsg_off = 0;
1362
1363                                 skb_fill_page_desc(skb, i, page, 0, 0);
1364                                 frag = &skb_shinfo(skb)->frags[i];
1365                         } else {
1366                                 err = -EMSGSIZE;
1367                                 goto error;
1368                         }
1369                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1370                                 err = -EFAULT;
1371                                 goto error;
1372                         }
                             /* Account the copied bytes on page, fragment, skb and socket. */
1373                         sk->sk_sndmsg_off += copy;
1374                         frag->size += copy;
1375                         skb->len += copy;
1376                         skb->data_len += copy;
1377                         skb->truesize += copy;
1378                         atomic_add(copy, &sk->sk_wmem_alloc);
1379                 }
1380                 offset += copy;
1381                 length -= copy;
1382         }
1383         return 0;
1384 error:
             /* Take the bytes that were not queued back out of cork.length. */
1385         inet->cork.length -= length;
1386         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1387         return err;
1388 }
1389
1390 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1391 {
1392         inet->cork.flags &= ~IPCORK_OPT;
1393         kfree(np->cork.opt);
1394         np->cork.opt = NULL;
1395         if (inet->cork.dst) {
1396                 dst_release(inet->cork.dst);
1397                 inet->cork.dst = NULL;
1398                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1399         }
1400         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1401 }
1402
1403 int ip6_push_pending_frames(struct sock *sk)
1404 {
1405         struct sk_buff *skb, *tmp_skb;
1406         struct sk_buff **tail_skb;
1407         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1408         struct inet_sock *inet = inet_sk(sk);
1409         struct ipv6_pinfo *np = inet6_sk(sk);
1410         struct ipv6hdr *hdr;
1411         struct ipv6_txoptions *opt = np->cork.opt;
1412         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1413         struct flowi *fl = &inet->cork.fl;
1414         unsigned char proto = fl->proto;
1415         int err = 0;
1416
1417         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1418                 goto out;
1419         tail_skb = &(skb_shinfo(skb)->frag_list);
1420
1421         /* move skb->data to ip header from ext header */
1422         if (skb->data < skb_network_header(skb))
1423                 __skb_pull(skb, skb_network_offset(skb));
1424         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1425                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1426                 *tail_skb = tmp_skb;
1427                 tail_skb = &(tmp_skb->next);
1428                 skb->len += tmp_skb->len;
1429                 skb->data_len += tmp_skb->len;
1430                 skb->truesize += tmp_skb->truesize;
1431                 __sock_put(tmp_skb->sk);
1432                 tmp_skb->destructor = NULL;
1433                 tmp_skb->sk = NULL;
1434         }
1435
1436         /* Allow local fragmentation. */
1437         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1438                 skb->local_df = 1;
1439
1440         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1441         __skb_pull(skb, skb_network_header_len(skb));
1442         if (opt && opt->opt_flen)
1443                 ipv6_push_frag_opts(skb, opt, &proto);
1444         if (opt && opt->opt_nflen)
1445                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1446
1447         skb_push(skb, sizeof(struct ipv6hdr));
1448         skb_reset_network_header(skb);
1449         hdr = ipv6_hdr(skb);
1450
1451         *(__be32*)hdr = fl->fl6_flowlabel |
1452                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1453
1454         hdr->hop_limit = np->cork.hop_limit;
1455         hdr->nexthdr = proto;
1456         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1457         ipv6_addr_copy(&hdr->daddr, final_dst);
1458
1459         skb->priority = sk->sk_priority;
1460         skb->mark = sk->sk_mark;
1461
1462         skb->dst = dst_clone(&rt->u.dst);
1463         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1464         if (proto == IPPROTO_ICMPV6) {
1465                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1466
1467                 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1468                 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1469         }
1470
1471         err = ip6_local_out(skb);
1472         if (err) {
1473                 if (err > 0)
1474                         err = np->recverr ? net_xmit_errno(err) : 0;
1475                 if (err)
1476                         goto error;
1477         }
1478
1479 out:
1480         ip6_cork_release(inet, np);
1481         return err;
1482 error:
1483         goto out;
1484 }
1485
1486 void ip6_flush_pending_frames(struct sock *sk)
1487 {
1488         struct sk_buff *skb;
1489
1490         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1491                 if (skb->dst)
1492                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
1493                                       IPSTATS_MIB_OUTDISCARDS);
1494                 kfree_skb(skb);
1495         }
1496
1497         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1498 }