Linux-2.6.12-rc2
[linux-flexiantxendom0-natty.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63         static u32 ipv6_fragmentation_id = 1;
64         static DEFINE_SPINLOCK(ip6_id_lock);
65
66         spin_lock_bh(&ip6_id_lock);
67         fhdr->identification = htonl(ipv6_fragmentation_id);
68         if (++ipv6_fragmentation_id == 0)
69                 ipv6_fragmentation_id = 1;
70         spin_unlock_bh(&ip6_id_lock);
71 }
72
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75
76         struct dst_entry *dst = skb->dst;
77         struct hh_cache *hh = dst->hh;
78
79         if (hh) {
80                 int hh_alen;
81
82                 read_lock_bh(&hh->hh_lock);
83                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85                 read_unlock_bh(&hh->hh_lock);
86                 skb_push(skb, hh->hh_len);
87                 return hh->hh_output(skb);
88         } else if (dst->neighbour)
89                 return dst->neighbour->output(skb);
90
91         IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92         kfree_skb(skb);
93         return -EINVAL;
94
95 }
96
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100         newskb->mac.raw = newskb->data;
101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
102         newskb->pkt_type = PACKET_LOOPBACK;
103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
104         BUG_TRAP(newskb->dst);
105
106         netif_rx(newskb);
107         return 0;
108 }
109
110
111 static int ip6_output2(struct sk_buff *skb)
112 {
113         struct dst_entry *dst = skb->dst;
114         struct net_device *dev = dst->dev;
115
116         skb->protocol = htons(ETH_P_IPV6);
117         skb->dev = dev;
118
119         if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121
122                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123                     ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124                                 &skb->nh.ipv6h->saddr)) {
125                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126
127                         /* Do not check for IFF_ALLMULTI; multicast routing
128                            is not supported in any case.
129                          */
130                         if (newskb)
131                                 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132                                         newskb->dev,
133                                         ip6_dev_loopback_xmit);
134
135                         if (skb->nh.ipv6h->hop_limit == 0) {
136                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137                                 kfree_skb(skb);
138                                 return 0;
139                         }
140                 }
141
142                 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143         }
144
145         return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
146 }
147
148 int ip6_output(struct sk_buff *skb)
149 {
150         if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151                 return ip6_fragment(skb, ip6_output2);
152         else
153                 return ip6_output2(skb);
154 }
155
#ifdef CONFIG_NETFILTER
/*
 *      Look up a fresh route for @skb from the addresses and next-header
 *      value currently in its IPv6 header (plus the bound device of the
 *      owning socket, if there is one) and install it in place of the
 *      old route.
 *
 *      Returns 0 on success.  If the lookup yields an error route the
 *      packet keeps its old route, OUTNOROUTES is counted and -EINVAL is
 *      returned; the skb itself is not freed here.
 */
int ip6_route_me_harder(struct sk_buff *skb)
{
        struct ipv6hdr *ip6h = skb->nh.ipv6h;
        struct dst_entry *new_dst;
        struct flowi fl;

        /* Build the lookup key; everything not set below stays zero. */
        memset(&fl, 0, sizeof(fl));
        if (skb->sk)
                fl.oif = skb->sk->sk_bound_dev_if;
        fl.proto = ip6h->nexthdr;
        ipv6_addr_copy(&fl.fl6_dst, &ip6h->daddr);
        ipv6_addr_copy(&fl.fl6_src, &ip6h->saddr);

        new_dst = ip6_route_output(skb->sk, &fl);

        if (!new_dst->error) {
                /* Drop old route. */
                dst_release(skb->dst);
                skb->dst = new_dst;
                return 0;
        }

        IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        LIMIT_NETDEBUG(
                printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
        dst_release(new_dst);
        return -EINVAL;
}
#endif
187
/*
 *      Continuation of the LOCAL_OUT hook.  With netfilter configured, a
 *      packet flagged NFC_ALTERED is re-routed first; if that fails the
 *      packet is freed and -EINVAL returned.  Otherwise the packet goes
 *      on to dst_output().
 */
static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
        if ((skb->nfcache & NFC_ALTERED) && ip6_route_me_harder(skb) != 0) {
                kfree_skb(skb);
                return -EINVAL;
        }
#endif /* CONFIG_NETFILTER */
        return dst_output(skb);
}
200
201 /*
202  *      xmit an sk_buff (used by TCP)
203  */
204
205 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
206              struct ipv6_txoptions *opt, int ipfragok)
207 {
208         struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
209         struct in6_addr *first_hop = &fl->fl6_dst;
210         struct dst_entry *dst = skb->dst;
211         struct ipv6hdr *hdr;
212         u8  proto = fl->proto;
213         int seg_len = skb->len;
214         int hlimit;
215         u32 mtu;
216
217         if (opt) {
218                 int head_room;
219
220                 /* First: exthdrs may take lots of space (~8K for now)
221                    MAX_HEADER is not enough.
222                  */
223                 head_room = opt->opt_nflen + opt->opt_flen;
224                 seg_len += head_room;
225                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
226
227                 if (skb_headroom(skb) < head_room) {
228                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
229                         kfree_skb(skb);
230                         skb = skb2;
231                         if (skb == NULL) {      
232                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
233                                 return -ENOBUFS;
234                         }
235                         if (sk)
236                                 skb_set_owner_w(skb, sk);
237                 }
238                 if (opt->opt_flen)
239                         ipv6_push_frag_opts(skb, opt, &proto);
240                 if (opt->opt_nflen)
241                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
242         }
243
244         hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
245
246         /*
247          *      Fill in the IPv6 header
248          */
249
250         *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
251         hlimit = -1;
252         if (np)
253                 hlimit = np->hop_limit;
254         if (hlimit < 0)
255                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
256         if (hlimit < 0)
257                 hlimit = ipv6_get_hoplimit(dst->dev);
258
259         hdr->payload_len = htons(seg_len);
260         hdr->nexthdr = proto;
261         hdr->hop_limit = hlimit;
262
263         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
264         ipv6_addr_copy(&hdr->daddr, first_hop);
265
266         mtu = dst_mtu(dst);
267         if ((skb->len <= mtu) || ipfragok) {
268                 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
269                 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
270         }
271
272         if (net_ratelimit())
273                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
274         skb->dev = dst->dev;
275         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
276         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
277         kfree_skb(skb);
278         return -EMSGSIZE;
279 }
280
281 /*
282  *      To avoid extra problems ND packets are send through this
283  *      routine. It's code duplication but I really want to avoid
284  *      extra checks since ipv6_build_header is used by TCP (which
285  *      is for us performance critical)
286  */
287
288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
289                struct in6_addr *saddr, struct in6_addr *daddr,
290                int proto, int len)
291 {
292         struct ipv6_pinfo *np = inet6_sk(sk);
293         struct ipv6hdr *hdr;
294         int totlen;
295
296         skb->protocol = htons(ETH_P_IPV6);
297         skb->dev = dev;
298
299         totlen = len + sizeof(struct ipv6hdr);
300
301         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
302         skb->nh.ipv6h = hdr;
303
304         *(u32*)hdr = htonl(0x60000000);
305
306         hdr->payload_len = htons(len);
307         hdr->nexthdr = proto;
308         hdr->hop_limit = np->hop_limit;
309
310         ipv6_addr_copy(&hdr->saddr, saddr);
311         ipv6_addr_copy(&hdr->daddr, daddr);
312
313         return 0;
314 }
315
316 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
317 {
318         struct ip6_ra_chain *ra;
319         struct sock *last = NULL;
320
321         read_lock(&ip6_ra_lock);
322         for (ra = ip6_ra_chain; ra; ra = ra->next) {
323                 struct sock *sk = ra->sk;
324                 if (sk && ra->sel == sel) {
325                         if (last) {
326                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
327                                 if (skb2)
328                                         rawv6_rcv(last, skb2);
329                         }
330                         last = sk;
331                 }
332         }
333
334         if (last) {
335                 rawv6_rcv(last, skb);
336                 read_unlock(&ip6_ra_lock);
337                 return 1;
338         }
339         read_unlock(&ip6_ra_lock);
340         return 0;
341 }
342
/*
 *      Continuation of the FORWARD hook: pass the packet on to the
 *      output routine of its route and return that routine's result.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
        int rc = dst_output(skb);

        return rc;
}
347
348 int ip6_forward(struct sk_buff *skb)
349 {
350         struct dst_entry *dst = skb->dst;
351         struct ipv6hdr *hdr = skb->nh.ipv6h;
352         struct inet6_skb_parm *opt = IP6CB(skb);
353         
354         if (ipv6_devconf.forwarding == 0)
355                 goto error;
356
357         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
358                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
359                 goto drop;
360         }
361
362         skb->ip_summed = CHECKSUM_NONE;
363
364         /*
365          *      We DO NOT make any processing on
366          *      RA packets, pushing them to user level AS IS
367          *      without ane WARRANTY that application will be able
368          *      to interpret them. The reason is that we
369          *      cannot make anything clever here.
370          *
371          *      We are not end-node, so that if packet contains
372          *      AH/ESP, we cannot make anything.
373          *      Defragmentation also would be mistake, RA packets
374          *      cannot be fragmented, because there is no warranty
375          *      that different fragments will go along one path. --ANK
376          */
377         if (opt->ra) {
378                 u8 *ptr = skb->nh.raw + opt->ra;
379                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
380                         return 0;
381         }
382
383         /*
384          *      check and decrement ttl
385          */
386         if (hdr->hop_limit <= 1) {
387                 /* Force OUTPUT device used as source address */
388                 skb->dev = dst->dev;
389                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
390                             0, skb->dev);
391
392                 kfree_skb(skb);
393                 return -ETIMEDOUT;
394         }
395
396         if (!xfrm6_route_forward(skb)) {
397                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
398                 goto drop;
399         }
400         dst = skb->dst;
401
402         /* IPv6 specs say nothing about it, but it is clear that we cannot
403            send redirects to source routed frames.
404          */
405         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
406                 struct in6_addr *target = NULL;
407                 struct rt6_info *rt;
408                 struct neighbour *n = dst->neighbour;
409
410                 /*
411                  *      incoming and outgoing devices are the same
412                  *      send a redirect.
413                  */
414
415                 rt = (struct rt6_info *) dst;
416                 if ((rt->rt6i_flags & RTF_GATEWAY))
417                         target = (struct in6_addr*)&n->primary_key;
418                 else
419                         target = &hdr->daddr;
420
421                 /* Limit redirects both by destination (here)
422                    and by source (inside ndisc_send_redirect)
423                  */
424                 if (xrlim_allow(dst, 1*HZ))
425                         ndisc_send_redirect(skb, n, target);
426         } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
427                                                 |IPV6_ADDR_LINKLOCAL)) {
428                 /* This check is security critical. */
429                 goto error;
430         }
431
432         if (skb->len > dst_mtu(dst)) {
433                 /* Again, force OUTPUT device used as source address */
434                 skb->dev = dst->dev;
435                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
436                 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
437                 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
438                 kfree_skb(skb);
439                 return -EMSGSIZE;
440         }
441
442         if (skb_cow(skb, dst->dev->hard_header_len)) {
443                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
444                 goto drop;
445         }
446
447         hdr = skb->nh.ipv6h;
448
449         /* Mangling hops number delayed to point after skb COW */
450  
451         hdr->hop_limit--;
452
453         IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
454         return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
455
456 error:
457         IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
458 drop:
459         kfree_skb(skb);
460         return -EINVAL;
461 }
462
463 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
464 {
465         to->pkt_type = from->pkt_type;
466         to->priority = from->priority;
467         to->protocol = from->protocol;
468         to->security = from->security;
469         dst_release(to->dst);
470         to->dst = dst_clone(from->dst);
471         to->dev = from->dev;
472
473 #ifdef CONFIG_NET_SCHED
474         to->tc_index = from->tc_index;
475 #endif
476 #ifdef CONFIG_NETFILTER
477         to->nfmark = from->nfmark;
478         /* Connection association is same as pre-frag packet */
479         to->nfct = from->nfct;
480         nf_conntrack_get(to->nfct);
481         to->nfctinfo = from->nfctinfo;
482 #ifdef CONFIG_BRIDGE_NETFILTER
483         nf_bridge_put(to->nf_bridge);
484         to->nf_bridge = from->nf_bridge;
485         nf_bridge_get(to->nf_bridge);
486 #endif
487 #ifdef CONFIG_NETFILTER_DEBUG
488         to->nf_debug = from->nf_debug;
489 #endif
490 #endif
491 }
492
493 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
494 {
495         u16 offset = sizeof(struct ipv6hdr);
496         struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
497         unsigned int packet_len = skb->tail - skb->nh.raw;
498         int found_rhdr = 0;
499         *nexthdr = &skb->nh.ipv6h->nexthdr;
500
501         while (offset + 1 <= packet_len) {
502
503                 switch (**nexthdr) {
504
505                 case NEXTHDR_HOP:
506                 case NEXTHDR_ROUTING:
507                 case NEXTHDR_DEST:
508                         if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
509                         if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
510                         offset += ipv6_optlen(exthdr);
511                         *nexthdr = &exthdr->nexthdr;
512                         exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
513                         break;
514                 default :
515                         return offset;
516                 }
517         }
518
519         return offset;
520 }
521
522 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
523 {
524         struct net_device *dev;
525         struct sk_buff *frag;
526         struct rt6_info *rt = (struct rt6_info*)skb->dst;
527         struct ipv6hdr *tmp_hdr;
528         struct frag_hdr *fh;
529         unsigned int mtu, hlen, left, len;
530         u32 frag_id = 0;
531         int ptr, offset = 0, err=0;
532         u8 *prevhdr, nexthdr = 0;
533
534         dev = rt->u.dst.dev;
535         hlen = ip6_find_1stfragopt(skb, &prevhdr);
536         nexthdr = *prevhdr;
537
538         mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
539
540         if (skb_shinfo(skb)->frag_list) {
541                 int first_len = skb_pagelen(skb);
542
543                 if (first_len - hlen > mtu ||
544                     ((first_len - hlen) & 7) ||
545                     skb_cloned(skb))
546                         goto slow_path;
547
548                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
549                         /* Correct geometry. */
550                         if (frag->len > mtu ||
551                             ((frag->len & 7) && frag->next) ||
552                             skb_headroom(frag) < hlen)
553                             goto slow_path;
554
555                         /* Correct socket ownership. */
556                         if (frag->sk == NULL)
557                                 goto slow_path;
558
559                         /* Partially cloned skb? */
560                         if (skb_shared(frag))
561                                 goto slow_path;
562                 }
563
564                 err = 0;
565                 offset = 0;
566                 frag = skb_shinfo(skb)->frag_list;
567                 skb_shinfo(skb)->frag_list = NULL;
568                 /* BUILD HEADER */
569
570                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
571                 if (!tmp_hdr) {
572                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
573                         return -ENOMEM;
574                 }
575
576                 *prevhdr = NEXTHDR_FRAGMENT;
577                 memcpy(tmp_hdr, skb->nh.raw, hlen);
578                 __skb_pull(skb, hlen);
579                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
580                 skb->nh.raw = __skb_push(skb, hlen);
581                 memcpy(skb->nh.raw, tmp_hdr, hlen);
582
583                 ipv6_select_ident(skb, fh);
584                 fh->nexthdr = nexthdr;
585                 fh->reserved = 0;
586                 fh->frag_off = htons(IP6_MF);
587                 frag_id = fh->identification;
588
589                 first_len = skb_pagelen(skb);
590                 skb->data_len = first_len - skb_headlen(skb);
591                 skb->len = first_len;
592                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
593  
594
595                 for (;;) {
596                         /* Prepare header of the next frame,
597                          * before previous one went down. */
598                         if (frag) {
599                                 frag->ip_summed = CHECKSUM_NONE;
600                                 frag->h.raw = frag->data;
601                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
602                                 frag->nh.raw = __skb_push(frag, hlen);
603                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
604                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
605                                 fh->nexthdr = nexthdr;
606                                 fh->reserved = 0;
607                                 fh->frag_off = htons(offset);
608                                 if (frag->next != NULL)
609                                         fh->frag_off |= htons(IP6_MF);
610                                 fh->identification = frag_id;
611                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
612                                 ip6_copy_metadata(frag, skb);
613                         }
614                         
615                         err = output(skb);
616                         if (err || !frag)
617                                 break;
618
619                         skb = frag;
620                         frag = skb->next;
621                         skb->next = NULL;
622                 }
623
624                 if (tmp_hdr)
625                         kfree(tmp_hdr);
626
627                 if (err == 0) {
628                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
629                         return 0;
630                 }
631
632                 while (frag) {
633                         skb = frag->next;
634                         kfree_skb(frag);
635                         frag = skb;
636                 }
637
638                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
639                 return err;
640         }
641
642 slow_path:
643         left = skb->len - hlen;         /* Space per frame */
644         ptr = hlen;                     /* Where to start from */
645
646         /*
647          *      Fragment the datagram.
648          */
649
650         *prevhdr = NEXTHDR_FRAGMENT;
651
652         /*
653          *      Keep copying data until we run out.
654          */
655         while(left > 0) {
656                 len = left;
657                 /* IF: it doesn't fit, use 'mtu' - the data space left */
658                 if (len > mtu)
659                         len = mtu;
660                 /* IF: we are not sending upto and including the packet end
661                    then align the next start on an eight byte boundary */
662                 if (len < left) {
663                         len &= ~7;
664                 }
665                 /*
666                  *      Allocate buffer.
667                  */
668
669                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
670                         NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
671                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
672                         err = -ENOMEM;
673                         goto fail;
674                 }
675
676                 /*
677                  *      Set up data on packet
678                  */
679
680                 ip6_copy_metadata(frag, skb);
681                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
682                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
683                 frag->nh.raw = frag->data;
684                 fh = (struct frag_hdr*)(frag->data + hlen);
685                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
686
687                 /*
688                  *      Charge the memory for the fragment to any owner
689                  *      it might possess
690                  */
691                 if (skb->sk)
692                         skb_set_owner_w(frag, skb->sk);
693
694                 /*
695                  *      Copy the packet header into the new buffer.
696                  */
697                 memcpy(frag->nh.raw, skb->data, hlen);
698
699                 /*
700                  *      Build fragment header.
701                  */
702                 fh->nexthdr = nexthdr;
703                 fh->reserved = 0;
704                 if (frag_id) {
705                         ipv6_select_ident(skb, fh);
706                         frag_id = fh->identification;
707                 } else
708                         fh->identification = frag_id;
709
710                 /*
711                  *      Copy a block of the IP datagram.
712                  */
713                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
714                         BUG();
715                 left -= len;
716
717                 fh->frag_off = htons(offset);
718                 if (left > 0)
719                         fh->frag_off |= htons(IP6_MF);
720                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
721
722                 ptr += len;
723                 offset += len;
724
725                 /*
726                  *      Put this fragment into the sending queue.
727                  */
728
729                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
730
731                 err = output(frag);
732                 if (err)
733                         goto fail;
734         }
735         kfree_skb(skb);
736         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
737         return err;
738
739 fail:
740         kfree_skb(skb); 
741         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
742         return err;
743 }
744
745 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
746 {
747         int err = 0;
748
749         *dst = NULL;
750         if (sk) {
751                 struct ipv6_pinfo *np = inet6_sk(sk);
752         
753                 *dst = sk_dst_check(sk, np->dst_cookie);
754                 if (*dst) {
755                         struct rt6_info *rt = (struct rt6_info*)*dst;
756         
757                                 /* Yes, checking route validity in not connected
758                                    case is not very simple. Take into account,
759                                    that we do not support routing by source, TOS,
760                                    and MSG_DONTROUTE            --ANK (980726)
761         
762                                    1. If route was host route, check that
763                                       cached destination is current.
764                                       If it is network route, we still may
765                                       check its validity using saved pointer
766                                       to the last used address: daddr_cache.
767                                       We do not want to save whole address now,
768                                       (because main consumer of this service
769                                        is tcp, which has not this problem),
770                                       so that the last trick works only on connected
771                                       sockets.
772                                    2. oif also should be the same.
773                                  */
774         
775                         if (((rt->rt6i_dst.plen != 128 ||
776                               !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
777                              && (np->daddr_cache == NULL ||
778                                  !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
779                             || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
780                                 dst_release(*dst);
781                                 *dst = NULL;
782                         }
783                 }
784         }
785
786         if (*dst == NULL)
787                 *dst = ip6_route_output(sk, fl);
788
789         if ((err = (*dst)->error))
790                 goto out_err_release;
791
792         if (ipv6_addr_any(&fl->fl6_src)) {
793                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
794
795                 if (err) {
796 #if IP6_DEBUG >= 2
797                         printk(KERN_DEBUG "ip6_dst_lookup: "
798                                "no available source address\n");
799 #endif
800                         goto out_err_release;
801                 }
802         }
803
804         return 0;
805
806 out_err_release:
807         dst_release(*dst);
808         *dst = NULL;
809         return err;
810 }
811
/*
 * ip6_append_data - append user data to the socket's pending write queue.
 *
 * sk:          socket whose sk_write_queue collects the pending skbs
 * getfrag:     copy callback; called as getfrag(from, to, offset, len, odd, skb)
 *              and treated as failed (-EFAULT) when it returns < 0.  The
 *              fifth argument is passed fraggap for a fresh skb and the
 *              skb length before the copy when appending to an existing one.
 * from:        opaque cookie handed through to getfrag
 * length:      number of bytes to append
 * transhdrlen: transport header length; only used when the queue is empty,
 *              forced to 0 otherwise
 * hlimit:      hop limit, stored in np->cork.hop_limit on the first call
 * opt:         optional tx options, copied into np->cork.opt on the first call
 * fl, rt:      flow and route; saved in the cork state on the first call and
 *              replaced by the saved values on later calls
 * flags:       MSG_PROBE returns 0 immediately; MSG_MORE and MSG_DONTWAIT
 *              influence skb allocation
 *
 * Returns 0 on success, or a negative errno (-ENOBUFS, -EINVAL, -EMSGSIZE,
 * -EFAULT, -ENOMEM, or whatever sock_alloc_send_skb() reported).  On the
 * common error exit the not-yet-appended bytes are subtracted from
 * inet->cork.length again; skbs that were already queued stay on the queue
 * (presumably the caller flushes them - verify against callers).
 */
812 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
813                     void *from, int length, int transhdrlen,
814                     int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
815                     unsigned int flags)
816 {
817         struct inet_sock *inet = inet_sk(sk);
818         struct ipv6_pinfo *np = inet6_sk(sk);
819         struct sk_buff *skb;
820         unsigned int maxfraglen, fragheaderlen;
821         int exthdrlen;
822         int hh_len;
823         int mtu;
824         int copy;
825         int err;
826         int offset = 0;
827         int csummode = CHECKSUM_NONE;
828
            /* MSG_PROBE: queue nothing and report success. */
829         if (flags&MSG_PROBE)
830                 return 0;
831         if (skb_queue_empty(&sk->sk_write_queue)) {
832                 /*
833                  * setup for corking
834                  */
835                 if (opt) {
836                         if (np->cork.opt == NULL) {
837                                 np->cork.opt = kmalloc(opt->tot_len,
838                                                        sk->sk_allocation);
839                                 if (unlikely(np->cork.opt == NULL))
840                                         return -ENOBUFS;
841                         } else if (np->cork.opt->tot_len < opt->tot_len) {
842                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
843                                 return -EINVAL;
844                         }
                            /*
                             * Flat copy of tot_len bytes.
                             * NOTE(review): any pointers stored inside *opt are
                             * copied as-is - confirm they remain valid for the
                             * lifetime of the cork.
                             */
845                         memcpy(np->cork.opt, opt, opt->tot_len);
846                         inet->cork.flags |= IPCORK_OPT;
847                         /* need source address above miyazawa*/
848                 }
                    /* The cork keeps its own reference on the route. */
849                 dst_hold(&rt->u.dst);
850                 np->cork.rt = rt;
851                 inet->cork.fl = *fl;
852                 np->cork.hop_limit = hlimit;
853                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
854                 if (dst_allfrag(rt->u.dst.path))
855                         inet->cork.flags |= IPCORK_ALLFRAG;
856                 inet->cork.length = 0;
857                 sk->sk_sndmsg_page = NULL;
858                 sk->sk_sndmsg_off = 0;
                    /*
                     * Extension header space (dst header_len plus the
                     * fragmentable options) is accounted as part of both the
                     * data length and the transport header of the first skb.
                     */
859                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
860                 length += exthdrlen;
861                 transhdrlen += exthdrlen;
862         } else {
                    /*
                     * Data is already pending: ignore the caller's rt/fl/opt
                     * and continue with the values saved when corking began.
                     */
863                 rt = np->cork.rt;
864                 fl = &inet->cork.fl;
865                 if (inet->cork.flags & IPCORK_OPT)
866                         opt = np->cork.opt;
867                 transhdrlen = 0;
868                 exthdrlen = 0;
869                 mtu = inet->cork.fragsize;
870         }
871
            /* Link-layer headroom reserved in front of every new skb. */
872         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
873
            /*
             * fragheaderlen: IPv6 header plus the non-fragmentable options.
             * maxfraglen:    space after those headers rounded down to a
             *                multiple of 8, plus the headers, minus the size
             *                of a fragment header.
             */
874         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
875         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
876
            /*
             * When the MTU does not exceed sizeof(ipv6hdr) + IPV6_MAXPLEN,
             * refuse to let the pending datagram grow past that limit and
             * report EMSGSIZE to the socket via ipv6_local_error().
             */
877         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
878                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
879                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
880                         return -EMSGSIZE;
881                 }
882         }
883
884         /*
885          * Let's try using as much space as possible.
886          * Use MTU if total length of the message fits into the MTU.
887          * Otherwise, we need to reserve fragment header and
888          * fragment alignment (= 8-15 octets, in total).
889          *
890          * Note that we may need to "move" the data from the tail
891          * of the buffer to the new fragment when we split
892          * the message.
893          *
894          * FIXME: It may be fragmented into multiple chunks
895          *        at once if non-fragmentable extension headers
896          *        are too large.
897          * --yoshfuji
898          */
899
900         inet->cork.length += length;
901
            /* Empty queue: jump straight into the allocation path with skb == NULL. */
902         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
903                 goto alloc_new_skb;
904
905         while (length > 0) {
906                 /* Check if the remaining data fits into current packet. */
907                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
908                 if (copy < length)
909                         copy = maxfraglen - skb->len;
910
911                 if (copy <= 0) {
912                         char *data;
913                         unsigned int datalen;
914                         unsigned int fraglen;
915                         unsigned int fraggap;
916                         unsigned int alloclen;
917                         struct sk_buff *skb_prev;
918 alloc_new_skb:
919                         skb_prev = skb;
920
921                         /* There's no room in the current skb */
                            /*
                             * fraggap is the number of bytes by which the previous
                             * skb exceeds maxfraglen; they are moved into the new
                             * skb further down.
                             */
922                         if (skb_prev)
923                                 fraggap = skb_prev->len - maxfraglen;
924                         else
925                                 fraggap = 0;
926
927                         /*
928                          * If remaining data exceeds the mtu,
929                          * we know we need more fragment(s).
930                          */
931                         datalen = length + fraggap;
932                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
933                                 datalen = maxfraglen - fragheaderlen;
934
935                         fraglen = datalen + fragheaderlen;
                            /*
                             * Without scatter-gather, later appends can only go
                             * into the linear area, so allocate a full MTU when
                             * more data is announced (MSG_MORE).
                             */
936                         if ((flags & MSG_MORE) &&
937                             !(rt->u.dst.dev->features&NETIF_F_SG))
938                                 alloclen = mtu;
939                         else
940                                 alloclen = datalen + fragheaderlen;
941
942                         /*
943                          * The last fragment gets additional space at tail.
944                          * Note: we overallocate on fragments with MSG_MORE
945                          * because we have no idea if we're the last one.
946                          */
947                         if (datalen == length + fraggap)
948                                 alloclen += rt->u.dst.trailer_len;
949
950                         /*
951                          * We just reserve space for fragment header.
952                          * Note: this may be overallocation if the message
953                          * (without MSG_MORE) fits into the MTU.
954                          */
955                         alloclen += sizeof(struct frag_hdr);
956
                            /*
                             * The skb that carries the transport header is
                             * allocated with sock_alloc_send_skb(); follow-up
                             * skbs use sock_wmalloc(), and only while
                             * sk_wmem_alloc is at most twice sk_sndbuf.
                             */
957                         if (transhdrlen) {
958                                 skb = sock_alloc_send_skb(sk,
959                                                 alloclen + hh_len,
960                                                 (flags & MSG_DONTWAIT), &err);
961                         } else {
962                                 skb = NULL;
963                                 if (atomic_read(&sk->sk_wmem_alloc) <=
964                                     2 * sk->sk_sndbuf)
965                                         skb = sock_wmalloc(sk,
966                                                            alloclen + hh_len, 1,
967                                                            sk->sk_allocation);
968                                 if (unlikely(skb == NULL))
969                                         err = -ENOBUFS;
970                         }
971                         if (skb == NULL)
972                                 goto error;
973                         /*
974                          *      Fill in the control structures
975                          */
976                         skb->ip_summed = csummode;
977                         skb->csum = 0;
978                         /* reserve for fragmentation */
979                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
980
981                         /*
982                          *      Find where to start putting bytes
983                          */
984                         data = skb_put(skb, fraglen);
985                         skb->nh.raw = data + exthdrlen;
986                         data += fragheaderlen;
987                         skb->h.raw = data + exthdrlen;
988
                            /*
                             * Move the overhanging fraggap bytes from the previous
                             * skb into this one, transfer their checksum
                             * contribution, and cut the previous skb back to
                             * maxfraglen.
                             */
989                         if (fraggap) {
990                                 skb->csum = skb_copy_and_csum_bits(
991                                         skb_prev, maxfraglen,
992                                         data + transhdrlen, fraggap, 0);
993                                 skb_prev->csum = csum_sub(skb_prev->csum,
994                                                           skb->csum);
995                                 data += fraggap;
996                                 skb_trim(skb_prev, maxfraglen);
997                         }
                            /* Bytes of new user data that go into this skb. */
998                         copy = datalen - transhdrlen - fraggap;
999                         if (copy < 0) {
1000                                 err = -EINVAL;
1001                                 kfree_skb(skb);
1002                                 goto error;
1003                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1004                                 err = -EFAULT;
1005                                 kfree_skb(skb);
1006                                 goto error;
1007                         }
1008
1009                         offset += copy;
1010                         length -= datalen - fraggap;
                             /* Header space is only accounted for in the first skb. */
1011                         transhdrlen = 0;
1012                         exthdrlen = 0;
1013                         csummode = CHECKSUM_NONE;
1014
1015                         /*
1016                          * Put the packet on the pending queue
1017                          */
1018                         __skb_queue_tail(&sk->sk_write_queue, skb);
1019                         continue;
1020                 }
1021
1022                 if (copy > length)
1023                         copy = length;
1024
                     /* No scatter-gather: copy into the linear tail of the skb. */
1025                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1026                         unsigned int off;
1027
1028                         off = skb->len;
1029                         if (getfrag(from, skb_put(skb, copy),
1030                                                 offset, copy, off, skb) < 0) {
                                     /* Undo the skb_put() above. */
1031                                 __skb_trim(skb, off);
1032                                 err = -EFAULT;
1033                                 goto error;
1034                         }
1035                 } else {
                             /*
                              * Scatter-gather: append to page fragments, reusing
                              * sk->sk_sndmsg_page while it still has room.
                              *
                              * NOTE(review): when nr_frags is 0, frag points one
                              * element before frags[] and frag->page is read below
                              * if sk_sndmsg_page is set - confirm this is benign.
                              */
1036                         int i = skb_shinfo(skb)->nr_frags;
1037                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1038                         struct page *page = sk->sk_sndmsg_page;
1039                         int off = sk->sk_sndmsg_off;
1040                         unsigned int left;
1041
1042                         if (page && (left = PAGE_SIZE - off) > 0) {
1043                                 if (copy >= left)
1044                                         copy = left;
                                     /*
                                      * The cached page is not the skb's last
                                      * fragment yet: attach it as a new,
                                      * initially empty fragment.
                                      */
1045                                 if (page != frag->page) {
1046                                         if (i == MAX_SKB_FRAGS) {
1047                                                 err = -EMSGSIZE;
1048                                                 goto error;
1049                                         }
1050                                         get_page(page);
1051                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1052                                         frag = &skb_shinfo(skb)->frags[i];
1053                                 }
1054                         } else if(i < MAX_SKB_FRAGS) {
                                     /* No usable cached page: allocate a fresh one. */
1055                                 if (copy > PAGE_SIZE)
1056                                         copy = PAGE_SIZE;
1057                                 page = alloc_pages(sk->sk_allocation, 0);
1058                                 if (page == NULL) {
1059                                         err = -ENOMEM;
1060                                         goto error;
1061                                 }
1062                                 sk->sk_sndmsg_page = page;
1063                                 sk->sk_sndmsg_off = 0;
1064
1065                                 skb_fill_page_desc(skb, i, page, 0, 0);
1066                                 frag = &skb_shinfo(skb)->frags[i];
                                     /* Charge the whole page to the skb and the socket. */
1067                                 skb->truesize += PAGE_SIZE;
1068                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1069                         } else {
1070                                 err = -EMSGSIZE;
1071                                 goto error;
1072                         }
1073                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1074                                 err = -EFAULT;
1075                                 goto error;
1076                         }
1077                         sk->sk_sndmsg_off += copy;
1078                         frag->size += copy;
1079                         skb->len += copy;
1080                         skb->data_len += copy;
1081                 }
1082                 offset += copy;
1083                 length -= copy;
1084         }
1085         return 0;
1086 error:
             /* Take the bytes that were not appended out of the pending length. */
1087         inet->cork.length -= length;
1088         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1089         return err;
1090 }
1091
1092 int ip6_push_pending_frames(struct sock *sk)
1093 {
1094         struct sk_buff *skb, *tmp_skb;
1095         struct sk_buff **tail_skb;
1096         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1097         struct inet_sock *inet = inet_sk(sk);
1098         struct ipv6_pinfo *np = inet6_sk(sk);
1099         struct ipv6hdr *hdr;
1100         struct ipv6_txoptions *opt = np->cork.opt;
1101         struct rt6_info *rt = np->cork.rt;
1102         struct flowi *fl = &inet->cork.fl;
1103         unsigned char proto = fl->proto;
1104         int err = 0;
1105
1106         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1107                 goto out;
1108         tail_skb = &(skb_shinfo(skb)->frag_list);
1109
1110         /* move skb->data to ip header from ext header */
1111         if (skb->data < skb->nh.raw)
1112                 __skb_pull(skb, skb->nh.raw - skb->data);
1113         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1114                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1115                 *tail_skb = tmp_skb;
1116                 tail_skb = &(tmp_skb->next);
1117                 skb->len += tmp_skb->len;
1118                 skb->data_len += tmp_skb->len;
1119 #if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
1120                 skb->truesize += tmp_skb->truesize;
1121                 __sock_put(tmp_skb->sk);
1122                 tmp_skb->destructor = NULL;
1123                 tmp_skb->sk = NULL;
1124 #endif
1125         }
1126
1127         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1128         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1129         if (opt && opt->opt_flen)
1130                 ipv6_push_frag_opts(skb, opt, &proto);
1131         if (opt && opt->opt_nflen)
1132                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1133
1134         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1135         
1136         *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
1137
1138         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1139                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1140         else
1141                 hdr->payload_len = 0;
1142         hdr->hop_limit = np->cork.hop_limit;
1143         hdr->nexthdr = proto;
1144         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1145         ipv6_addr_copy(&hdr->daddr, final_dst);
1146
1147         skb->dst = dst_clone(&rt->u.dst);
1148         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
1149         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1150         if (err) {
1151                 if (err > 0)
1152                         err = inet->recverr ? net_xmit_errno(err) : 0;
1153                 if (err)
1154                         goto error;
1155         }
1156
1157 out:
1158         inet->cork.flags &= ~IPCORK_OPT;
1159         if (np->cork.opt) {
1160                 kfree(np->cork.opt);
1161                 np->cork.opt = NULL;
1162         }
1163         if (np->cork.rt) {
1164                 dst_release(&np->cork.rt->u.dst);
1165                 np->cork.rt = NULL;
1166                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1167         }
1168         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1169         return err;
1170 error:
1171         goto out;
1172 }
1173
1174 void ip6_flush_pending_frames(struct sock *sk)
1175 {
1176         struct inet_sock *inet = inet_sk(sk);
1177         struct ipv6_pinfo *np = inet6_sk(sk);
1178         struct sk_buff *skb;
1179
1180         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1181                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1182                 kfree_skb(skb);
1183         }
1184
1185         inet->cork.flags &= ~IPCORK_OPT;
1186
1187         if (np->cork.opt) {
1188                 kfree(np->cork.opt);
1189                 np->cork.opt = NULL;
1190         }
1191         if (np->cork.rt) {
1192                 dst_release(&np->cork.rt->u.dst);
1193                 np->cork.rt = NULL;
1194                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1195         }
1196         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1197 }