netfilter: ipv6: add IPSKB_REROUTED exclusion to NF_HOOK/POSTROUTING invocation
net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
        int len;

        len = skb->len - sizeof(struct ipv6hdr);
        if (len > IPV6_MAXPLEN)
                len = 0;
        ipv6_hdr(skb)->payload_len = htons(len);

        return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip6_local_out(skb);
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

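/*
 * Illustrative sketch (not part of the original file): nf_hook() returns 1
 * when the LOCAL_OUT hook accepts the packet without stealing, queueing or
 * dropping it, which is why ip6_local_out() above only calls dst_output()
 * on that value.  A caller that has just finished building an IPv6 packet
 * therefore needs nothing more than:
 *
 *      err = ip6_local_out(skb);          (skb is consumed either way)
 *      if (err > 0)
 *              err = net_xmit_errno(err); (map NET_XMIT_* codes to an errno)
 *
 * This mirrors what ip6_push_pending_frames() does near the end of this
 * file.
 */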
/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(newskb));

        netif_rx(newskb);
        return 0;
}

static int ip6_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
                    ((mroute6_socket(dev_net(dev)) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(dev_net(dev), idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
                                skb->len);
        }

        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(dev_net(dst->dev),
                         ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
        struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

        return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
               skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

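/*
 * Illustrative note (not part of the original file): the
 * IPV6_PMTUDISC_PROBE mode tested above is opted into from user space,
 * e.g.:
 *
 *      int val = IPV6_PMTUDISC_PROBE;
 *      setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER, &val, sizeof(val));
 *
 * With it set, ip6_skb_dst_mtu() reports the device MTU rather than the
 * possibly smaller cached path MTU, so PMTU probe packets are not sized
 * down by stale path MTU state.
 */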
static int ip6_finish_output(struct sk_buff *skb)
{
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)))
                return ip6_fragment(skb, ip6_finish_output2);
        else
                return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(dev_net(dev), idev,
                              IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

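/*
 * Editorial note on the NF_HOOK_COND() call above, which is the point of
 * this change: the macro behaves roughly like the sketch below (see
 * include/linux/netfilter.h for the exact definition), so a packet whose
 * IP6CB flags carry IP6SKB_REROUTED (one that was rerouted after already
 * passing this point, e.g. on the xfrm output path) is not run through
 * the POST_ROUTING hook a second time:
 *
 *      if (!cond)
 *              return okfn(skb);                        (hook bypassed)
 *      return NF_HOOK(pf, hook, skb, in, out, okfn);    (normal traversal)
 *
 * This matches the IPSKB_REROUTED handling in the IPv4 ip_output() path.
 */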
/*
 *      xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct net *net = sock_net(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit = -1;
        int tclass = 0;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (skb2 == NULL) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        kfree_skb(skb);
                        skb = skb2;
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /* Allow local fragmentation. */
        if (ipfragok)
                skb->local_df = 1;

        /*
         *      Fill in the IPv6 header
         */
        if (np) {
                tclass = np->tclass;
                hlimit = np->hop_limit;
        }
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                               dst->dev, dst_output);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

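/*
 * Illustrative sketch (not part of the original file): the first 32-bit
 * word written by ip6_xmit() packs version, traffic class and flow label,
 *
 *      | version (4) | traffic class (8) | flow label (20) |
 *
 * 0x60000000 supplies version 6, the traffic class is shifted into bits
 * 27..20 before the byte swap, and fl->fl6_flowlabel is assumed here to
 * carry only the label bits, already in network byte order, so it can
 * simply be ORed in afterwards.
 */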
/*
 *      To avoid extra problems ND packets are sent through this
 *      routine. It is code duplication, but I really want to avoid
 *      extra checks, since ipv6_build_header is used by TCP (which
 *      is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               const struct in6_addr *saddr, const struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        skb_reset_network_header(skb);
        skb_put(skb, sizeof(struct ipv6hdr));
        hdr = ipv6_hdr(skb);

        *(__be32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* A unicast neighbour discovery message destined
                         * to the proxied address must be passed to the
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We DO NOT do any processing on RA packets;
         *      they are pushed to user level AS IS, without any
         *      WARRANTY that the application will be able
         *      to interpret them. The reason is that we
         *      cannot do anything clever here.
         *
         *      We are not the end node, so if the packet contains
         *      AH/ESP, we cannot do anything.
         *      Defragmentation would also be a mistake; RA packets
         *      must not be fragmented, because there is no guarantee
         *      that different fragments will go along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb_network_header(skb) + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement the hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(dst),
                                      IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects for source-routed frames.
           We also don't send redirects for frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
            !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      The incoming and outgoing devices are the same;
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect).
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = dst_mtu(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (skb->len > mtu) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop limit is delayed until after the skb COW. */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

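/*
 * Editorial note on ip6_forward() above: the hop limit is tested while the
 * header may still be shared, but the actual decrement is deliberately
 * deferred until after skb_cow() succeeds, so a cloned header is never
 * written in place.  Redirect generation is also rate limited per
 * destination via xrlim_allow(dst, 1*HZ), i.e. roughly one redirect per
 * second per destination cache entry.
 */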
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
        skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr =
                                (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
        unsigned int packet_len = skb->tail - skb->network_header;
        int found_rhdr = 0;
        *nexthdr = &ipv6_hdr(skb)->nexthdr;

        while (offset + 1 <= packet_len) {

                switch (**nexthdr) {

                case NEXTHDR_HOP:
                        break;
                case NEXTHDR_ROUTING:
                        found_rhdr = 1;
                        break;
                case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
                        if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
                                break;
#endif
                        if (found_rhdr)
                                return offset;
                        break;
                default:
                        return offset;
                }

                offset += ipv6_optlen(exthdr);
                *nexthdr = &exthdr->nexthdr;
                exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
                                                 offset);
        }

        return offset;
}

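/*
 * Illustrative sketch (not part of the original file): ip6_find_1stfragopt()
 * returns the byte offset at which a Fragment header has to be spliced in,
 * i.e. the end of the unfragmentable part.  For a packet laid out as
 *
 *      [IPv6][Hop-by-Hop][Routing][Destination options][TCP]...
 *
 * it walks past the Hop-by-Hop and Routing headers, then stops at the
 * Destination options header because a Routing header was already seen
 * (the found_rhdr logic above); *nexthdr is left pointing at the "next
 * header" byte that the caller rewrites to NEXTHDR_FRAGMENT.
 */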
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        __be32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;
        struct net *net = dev_net(skb_dst(skb)->dev);

        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb was not generated by a local socket.
         */
        if (!skb->local_df) {
                skb->dev = skb_dst(skb)->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        mtu -= hlen + sizeof(struct frag_hdr);

        if (skb_has_frags(skb)) {
                int first_len = skb_pagelen(skb);
                int truesizes = 0;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                            goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                truesizes += frag->truesize;
                        }
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                ipv6_select_ident(fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->truesize -= truesizes;
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->u.dst);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one is sent. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
                                      IPSTATS_MIB_FRAGOKS);
                        dst_release(&rt->u.dst);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
                              IPSTATS_MIB_FRAGFAILS);
                dst_release(&rt->u.dst);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight-byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        kfree_skb(skb);
        return err;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

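/*
 * Worked example for ip6_fragment() (illustrative numbers, not part of
 * the original file): with a 1500 byte MTU and a 40 byte unfragmentable
 * part (hlen), the code above leaves mtu = 1500 - 40 - 8 = 1452 bytes of
 * fragmentable payload per packet; the slow path additionally rounds
 * every non-final fragment down to a multiple of 8 (len &= ~7), giving
 * 1448 bytes, as required by the 8-octet units of the fragment offset
 * field.
 */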
static inline int ip6_rt_check(struct rt6key *rt_key,
                               struct in6_addr *fl_addr,
                               struct in6_addr *addr_cache)
{
        return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          struct flowi *fl)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt = (struct rt6_info *)dst;

        if (!dst)
                goto out;

        /* Yes, checking route validity in the unconnected
         * case is not very simple. Take into account that
         * we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using a saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is TCP, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif should also be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
            (fl->oif && fl->oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
                               struct dst_entry **dst, struct flowi *fl)
{
        int err;
        struct net *net = sock_net(sk);

        if (*dst == NULL)
                *dst = ip6_route_output(net, sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
                                         &fl->fl6_dst,
                                         sk ? inet6_sk(sk)->srcprefs : 0,
                                         &fl->fl6_src);
                if (err)
                        goto out_err_release;
        }

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * If the dst entry we've looked up has a neighbour
         * entry that is in the INCOMPLETE state and the
         * source address from the flow is marked as
         * OPTIMISTIC, we release the found dst entry and
         * replace it with the dst entry of the nexthop
         * router.
         */
        if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
                struct inet6_ifaddr *ifp;
                struct flowi fl_gw;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead.
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw, fl, sizeof(struct flowi));
                        memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw);
                        if ((err = (*dst)->error))
                                goto out_err_release;
                }
        }
#endif

        return 0;

out_err_release:
        if (err == -ENETUNREACH)
                IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        dst_release(*dst);
        *dst = NULL;
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @dst: pointer to dst_entry * for result
 *      @fl: flow to lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        *dst = NULL;
        if (sk) {
                *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
                *dst = ip6_sk_dst_check(sk, *dst, fl);
        }

        return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

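/*
 * Illustrative sketch (hypothetical caller, not part of the original
 * file): a datagram sender would typically fill in a flow, resolve it,
 * and drop the reference when done:
 *
 *      struct dst_entry *dst;
 *      struct flowi fl;
 *      int err;
 *
 *      memset(&fl, 0, sizeof(fl));
 *      fl.proto = IPPROTO_UDP;
 *      fl.oif = sk->sk_bound_dev_if;
 *      ipv6_addr_copy(&fl.fl6_dst, daddr);   (daddr: chosen destination)
 *      err = ip6_sk_dst_lookup(sk, &dst, &fl);
 *      if (err)
 *              return err;   (no route, or no usable source address)
 *      ...
 *      dst_release(dst);
 */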
static inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)

{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP large send offload,
         * so create one single skb containing the complete
         * UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return -ENOMEM;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                                      (length - transhdrlen));
        if (!err) {
                struct frag_hdr fhdr;

                /* Specify the length of each IPv6 datagram fragment.
                 * It has to be a multiple of 8.
                 */
                skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                             sizeof(struct frag_hdr)) & ~7;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                ipv6_select_ident(&fhdr);
                skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UDP LSO,
         * so follow the normal path.
         */
        kfree_skb(skb);

        return err;
}

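/*
 * Worked example (illustrative numbers, not part of the original file):
 * for an mtu of 1500 and a fragheaderlen of 40, the gso_size computed
 * above is (1500 - 40 - 8) & ~7 = 1448, so the UFO-capable device emits
 * fragments whose payloads are the largest multiple of 8 that still fits
 * the MTU together with the IPv6 and Fragment headers.
 */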
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

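/*
 * Editorial note on the two helpers above (illustrative): they rely on
 * the RFC 2460 extension header length encoding, where hdrlen counts
 * 8-octet units beyond the first 8 octets, hence the duplicated size of
 * (src->hdrlen + 1) * 8 bytes.  For example, a routing header with
 * hdrlen == 2 occupies (2 + 1) * 8 = 24 bytes.
 */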
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
        int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
        struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (WARN_ON(np->cork.opt))
                                return -EINVAL;

                        np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
                        if (unlikely(np->cork.opt == NULL))
                                return -ENOBUFS;

                        np->cork.opt->tot_len = opt->tot_len;
                        np->cork.opt->opt_flen = opt->opt_flen;
                        np->cork.opt->opt_nflen = opt->opt_nflen;

                        np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                            sk->sk_allocation);
                        if (opt->dst0opt && !np->cork.opt->dst0opt)
                                return -ENOBUFS;

                        np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                            sk->sk_allocation);
                        if (opt->dst1opt && !np->cork.opt->dst1opt)
                                return -ENOBUFS;

                        np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                           sk->sk_allocation);
                        if (opt->hopopt && !np->cork.opt->hopopt)
                                return -ENOBUFS;

                        np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                            sk->sk_allocation);
                        if (opt->srcrt && !np->cork.opt->srcrt)
                                return -ENOBUFS;

                        /* need source address above --miyazawa */
                }
                dst_hold(&rt->u.dst);
                inet->cork.dst = &rt->u.dst;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
                      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
                if (np->frag_size < mtu) {
                        if (np->frag_size)
                                mtu = np->frag_size;
                }
                inet->cork.fragsize = mtu;
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
                            rt->rt6i_nfheader_len;
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = (struct rt6_info *)inet->cork.dst;
                fl = &inet->cork.fl;
                opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        inet->cork.length += length;
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {

                err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
                                          fragheaderlen, transhdrlen, mtu,
                                          flags);
                if (err)
                        goto error;
                return 0;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be an overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
        if (np->cork.opt) {
                kfree(np->cork.opt->dst0opt);
                kfree(np->cork.opt->dst1opt);
                kfree(np->cork.opt->hopopt);
                kfree(np->cork.opt->srcrt);
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }

        if (inet->cork.dst) {
                dst_release(inet->cork.dst);
                inet->cork.dst = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        if (np->pmtudisc < IPV6_PMTUDISC_DO)
                skb->local_df = 1;

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb_network_header_len(skb));
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        *(__be32*)hdr = fl->fl6_flowlabel |
                     htonl(0x60000000 | ((int)np->cork.tclass << 20));

        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        skb_dst_set(skb, dst_clone(&rt->u.dst));
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
                ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
        }

        err = ip6_local_out(skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        goto error;
        }

out:
        ip6_cork_release(inet, np);
        return err;
error:
        IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

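/*
 * Illustrative sketch (hypothetical transport code, not part of the
 * original file): the usual corked-send sequence built on
 * ip6_append_data(), ip6_push_pending_frames() above and the flush
 * helper below is
 *
 *      err = ip6_append_data(sk, getfrag, msg, len, sizeof(struct udphdr),
 *                            hlimit, tclass, opt, fl, rt, msg->msg_flags);
 *      if (err)
 *              ip6_flush_pending_frames(sk);       (drop whatever was queued)
 *      else if (!corkreq)
 *              err = ip6_push_pending_frames(sk);  (build header and send)
 *
 * which is essentially the shape of udp_v6_sendmsg().
 */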
void ip6_flush_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                if (skb_dst(skb))
                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}