2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
99 static int ip6_finish_output2(struct sk_buff *skb)
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
104 skb->protocol = htons(ETH_P_IPV6);
107 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
108 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
111 ((mroute6_socket(dev_net(dev), skb) &&
112 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
113 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
114 &ipv6_hdr(skb)->saddr))) {
115 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117 /* Do not check for IFF_ALLMULTI; multicast routing
118 is not supported in any case.
121 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
122 newskb, NULL, newskb->dev,
123 ip6_dev_loopback_xmit);
125 if (ipv6_hdr(skb)->hop_limit == 0) {
126 IP6_INC_STATS(dev_net(dev), idev,
127 IPSTATS_MIB_OUTDISCARDS);
133 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
138 return neigh_hh_output(dst->hh, skb);
139 else if (dst->neighbour)
140 return dst->neighbour->output(skb);
142 IP6_INC_STATS_BH(dev_net(dst->dev),
143 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
148 static int ip6_finish_output(struct sk_buff *skb)
150 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
151 dst_allfrag(skb_dst(skb)))
152 return ip6_fragment(skb, ip6_finish_output2);
154 return ip6_finish_output2(skb);
157 int ip6_output(struct sk_buff *skb)
159 struct net_device *dev = skb_dst(skb)->dev;
160 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 if (unlikely(idev->cnf.disable_ipv6)) {
162 IP6_INC_STATS(dev_net(dev), idev,
163 IPSTATS_MIB_OUTDISCARDS);
168 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
170 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
174 * xmit an sk_buff (used by TCP, SCTP and DCCP)
177 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
178 struct ipv6_txoptions *opt)
180 struct net *net = sock_net(sk);
181 struct ipv6_pinfo *np = inet6_sk(sk);
182 struct in6_addr *first_hop = &fl->fl6_dst;
183 struct dst_entry *dst = skb_dst(skb);
185 u8 proto = fl->proto;
186 int seg_len = skb->len;
192 unsigned int head_room;
194 /* First: exthdrs may take lots of space (~8K for now)
195 MAX_HEADER is not enough.
197 head_room = opt->opt_nflen + opt->opt_flen;
198 seg_len += head_room;
199 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
201 if (skb_headroom(skb) < head_room) {
202 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
204 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
205 IPSTATS_MIB_OUTDISCARDS);
211 skb_set_owner_w(skb, sk);
214 ipv6_push_frag_opts(skb, opt, &proto);
216 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
219 skb_push(skb, sizeof(struct ipv6hdr));
220 skb_reset_network_header(skb);
224 * Fill in the IPv6 header
228 hlimit = np->hop_limit;
231 hlimit = ip6_dst_hoplimit(dst);
233 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
235 hdr->payload_len = htons(seg_len);
236 hdr->nexthdr = proto;
237 hdr->hop_limit = hlimit;
239 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
240 ipv6_addr_copy(&hdr->daddr, first_hop);
242 skb->priority = sk->sk_priority;
243 skb->mark = sk->sk_mark;
246 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
247 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248 IPSTATS_MIB_OUT, skb->len);
249 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
250 dst->dev, dst_output);
254 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
256 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
257 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
262 EXPORT_SYMBOL(ip6_xmit);
265 * To avoid extra problems ND packets are sent through this
266 * routine. It's code duplication but I really want to avoid
267 * extra checks since ipv6_build_header is used by TCP (which
268 * is for us performance critical)
271 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
272 const struct in6_addr *saddr, const struct in6_addr *daddr,
275 struct ipv6_pinfo *np = inet6_sk(sk);
279 skb->protocol = htons(ETH_P_IPV6);
282 totlen = len + sizeof(struct ipv6hdr);
284 skb_reset_network_header(skb);
285 skb_put(skb, sizeof(struct ipv6hdr));
288 *(__be32*)hdr = htonl(0x60000000);
290 hdr->payload_len = htons(len);
291 hdr->nexthdr = proto;
292 hdr->hop_limit = np->hop_limit;
294 ipv6_addr_copy(&hdr->saddr, saddr);
295 ipv6_addr_copy(&hdr->daddr, daddr);
300 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
302 struct ip6_ra_chain *ra;
303 struct sock *last = NULL;
305 read_lock(&ip6_ra_lock);
306 for (ra = ip6_ra_chain; ra; ra = ra->next) {
307 struct sock *sk = ra->sk;
308 if (sk && ra->sel == sel &&
309 (!sk->sk_bound_dev_if ||
310 sk->sk_bound_dev_if == skb->dev->ifindex)) {
312 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
314 rawv6_rcv(last, skb2);
321 rawv6_rcv(last, skb);
322 read_unlock(&ip6_ra_lock);
325 read_unlock(&ip6_ra_lock);
329 static int ip6_forward_proxy_check(struct sk_buff *skb)
331 struct ipv6hdr *hdr = ipv6_hdr(skb);
332 u8 nexthdr = hdr->nexthdr;
335 if (ipv6_ext_hdr(nexthdr)) {
336 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
340 offset = sizeof(struct ipv6hdr);
342 if (nexthdr == IPPROTO_ICMPV6) {
343 struct icmp6hdr *icmp6;
345 if (!pskb_may_pull(skb, (skb_network_header(skb) +
346 offset + 1 - skb->data)))
349 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
351 switch (icmp6->icmp6_type) {
352 case NDISC_ROUTER_SOLICITATION:
353 case NDISC_ROUTER_ADVERTISEMENT:
354 case NDISC_NEIGHBOUR_SOLICITATION:
355 case NDISC_NEIGHBOUR_ADVERTISEMENT:
357 /* For reaction involving unicast neighbor discovery
358 * message destined to the proxied address, pass it to
368 * The proxying router can't forward traffic sent to a link-local
369 * address, so signal the sender and discard the packet. This
370 * behavior is clarified by the MIPv6 specification.
372 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
373 dst_link_failure(skb);
380 static inline int ip6_forward_finish(struct sk_buff *skb)
382 return dst_output(skb);
385 int ip6_forward(struct sk_buff *skb)
387 struct dst_entry *dst = skb_dst(skb);
388 struct ipv6hdr *hdr = ipv6_hdr(skb);
389 struct inet6_skb_parm *opt = IP6CB(skb);
390 struct net *net = dev_net(dst->dev);
393 if (net->ipv6.devconf_all->forwarding == 0)
396 if (skb_warn_if_lro(skb))
399 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
404 if (skb->pkt_type != PACKET_HOST)
407 skb_forward_csum(skb);
410 * We DO NOT make any processing on
411 * RA packets, pushing them to user level AS IS
412 * without any WARRANTY that application will be able
413 * to interpret them. The reason is that we
414 * cannot make anything clever here.
416 * We are not end-node, so that if packet contains
417 * AH/ESP, we cannot make anything.
418 * Defragmentation also would be mistake, RA packets
419 * cannot be fragmented, because there is no warranty
420 * that different fragments will go along one path. --ANK
423 u8 *ptr = skb_network_header(skb) + opt->ra;
424 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
429 * check and decrement ttl
431 if (hdr->hop_limit <= 1) {
432 /* Force OUTPUT device used as source address */
434 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435 IP6_INC_STATS_BH(net,
436 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
442 /* XXX: idev->cnf.proxy_ndp? */
443 if (net->ipv6.devconf_all->proxy_ndp &&
444 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445 int proxied = ip6_forward_proxy_check(skb);
447 return ip6_input(skb);
448 else if (proxied < 0) {
449 IP6_INC_STATS(net, ip6_dst_idev(dst),
450 IPSTATS_MIB_INDISCARDS);
455 if (!xfrm6_route_forward(skb)) {
456 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
461 /* IPv6 specs say nothing about it, but it is clear that we cannot
462 send redirects to source routed frames.
463 We don't send redirects to frames decapsulated from IPsec.
465 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
466 !skb_sec_path(skb)) {
467 struct in6_addr *target = NULL;
469 struct neighbour *n = dst->neighbour;
472 * incoming and outgoing devices are the same
476 rt = (struct rt6_info *) dst;
477 if ((rt->rt6i_flags & RTF_GATEWAY))
478 target = (struct in6_addr*)&n->primary_key;
480 target = &hdr->daddr;
482 /* Limit redirects both by destination (here)
483 and by source (inside ndisc_send_redirect)
485 if (xrlim_allow(dst, 1*HZ))
486 ndisc_send_redirect(skb, n, target);
488 int addrtype = ipv6_addr_type(&hdr->saddr);
490 /* This check is security critical. */
491 if (addrtype == IPV6_ADDR_ANY ||
492 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
494 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 ICMPV6_NOT_NEIGHBOUR, 0);
502 if (mtu < IPV6_MIN_MTU)
505 if (skb->len > mtu && !skb_is_gso(skb)) {
506 /* Again, force OUTPUT device used as source address */
508 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 IP6_INC_STATS_BH(net,
510 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
517 if (skb_cow(skb, dst->dev->hard_header_len)) {
518 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
524 /* Mangling hops number delayed to point after skb COW */
528 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
533 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
541 to->pkt_type = from->pkt_type;
542 to->priority = from->priority;
543 to->protocol = from->protocol;
545 skb_dst_set(to, dst_clone(skb_dst(from)));
547 to->mark = from->mark;
549 #ifdef CONFIG_NET_SCHED
550 to->tc_index = from->tc_index;
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 to->nf_trace = from->nf_trace;
557 skb_copy_secmark(to, from);
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
562 u16 offset = sizeof(struct ipv6hdr);
563 struct ipv6_opt_hdr *exthdr =
564 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 unsigned int packet_len = skb->tail - skb->network_header;
567 *nexthdr = &ipv6_hdr(skb)->nexthdr;
569 while (offset + 1 <= packet_len) {
575 case NEXTHDR_ROUTING:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
590 offset += ipv6_optlen(exthdr);
591 *nexthdr = &exthdr->nexthdr;
592 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
599 static u32 hashidentrnd __read_mostly;
600 #define FID_HASH_SZ 16
601 static u32 ipv6_fragmentation_id[FID_HASH_SZ];
603 void __init initialize_hashidentrnd(void)
605 get_random_bytes(&hashidentrnd, sizeof(hashidentrnd));
608 static u32 __ipv6_select_ident(const struct in6_addr *addr)
610 u32 newid, oldid, hash = jhash2((u32 *)addr, 4, hashidentrnd);
611 u32 *pid = &ipv6_fragmentation_id[hash % FID_HASH_SZ];
618 } while (cmpxchg(pid, oldid, newid) != oldid);
623 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
625 fhdr->identification = htonl(__ipv6_select_ident(&rt->rt6i_dst.addr));
628 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
630 struct sk_buff *frag;
631 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
632 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
633 struct ipv6hdr *tmp_hdr;
635 unsigned int mtu, hlen, left, len;
637 int ptr, offset = 0, err=0;
638 u8 *prevhdr, nexthdr = 0;
639 struct net *net = dev_net(skb_dst(skb)->dev);
641 hlen = ip6_find_1stfragopt(skb, &prevhdr);
644 mtu = ip6_skb_dst_mtu(skb);
646 /* We must not fragment if the socket is set to force MTU discovery
647 * or if the skb is not generated by a local socket.
649 if (!skb->local_df && skb->len > mtu) {
650 skb->dev = skb_dst(skb)->dev;
651 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
652 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
653 IPSTATS_MIB_FRAGFAILS);
658 if (np && np->frag_size < mtu) {
662 mtu -= hlen + sizeof(struct frag_hdr);
664 if (skb_has_frag_list(skb)) {
665 int first_len = skb_pagelen(skb);
666 struct sk_buff *frag2;
668 if (first_len - hlen > mtu ||
669 ((first_len - hlen) & 7) ||
673 skb_walk_frags(skb, frag) {
674 /* Correct geometry. */
675 if (frag->len > mtu ||
676 ((frag->len & 7) && frag->next) ||
677 skb_headroom(frag) < hlen)
678 goto slow_path_clean;
680 /* Partially cloned skb? */
681 if (skb_shared(frag))
682 goto slow_path_clean;
687 frag->destructor = sock_wfree;
689 skb->truesize -= frag->truesize;
694 frag = skb_shinfo(skb)->frag_list;
695 skb_frag_list_init(skb);
698 *prevhdr = NEXTHDR_FRAGMENT;
699 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
701 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
702 IPSTATS_MIB_FRAGFAILS);
706 __skb_pull(skb, hlen);
707 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
708 __skb_push(skb, hlen);
709 skb_reset_network_header(skb);
710 memcpy(skb_network_header(skb), tmp_hdr, hlen);
712 ipv6_select_ident(fh, rt);
713 fh->nexthdr = nexthdr;
715 fh->frag_off = htons(IP6_MF);
716 frag_id = fh->identification;
718 first_len = skb_pagelen(skb);
719 skb->data_len = first_len - skb_headlen(skb);
720 skb->len = first_len;
721 ipv6_hdr(skb)->payload_len = htons(first_len -
722 sizeof(struct ipv6hdr));
727 /* Prepare header of the next frame,
728 * before previous one went down. */
730 frag->ip_summed = CHECKSUM_NONE;
731 skb_reset_transport_header(frag);
732 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
733 __skb_push(frag, hlen);
734 skb_reset_network_header(frag);
735 memcpy(skb_network_header(frag), tmp_hdr,
737 offset += skb->len - hlen - sizeof(struct frag_hdr);
738 fh->nexthdr = nexthdr;
740 fh->frag_off = htons(offset);
741 if (frag->next != NULL)
742 fh->frag_off |= htons(IP6_MF);
743 fh->identification = frag_id;
744 ipv6_hdr(frag)->payload_len =
746 sizeof(struct ipv6hdr));
747 ip6_copy_metadata(frag, skb);
752 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
753 IPSTATS_MIB_FRAGCREATES);
766 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
767 IPSTATS_MIB_FRAGOKS);
768 dst_release(&rt->dst);
778 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
779 IPSTATS_MIB_FRAGFAILS);
780 dst_release(&rt->dst);
784 skb_walk_frags(skb, frag2) {
788 frag2->destructor = NULL;
789 skb->truesize += frag2->truesize;
794 left = skb->len - hlen; /* Space per frame */
795 ptr = hlen; /* Where to start from */
798 * Fragment the datagram.
801 *prevhdr = NEXTHDR_FRAGMENT;
804 * Keep copying data until we run out.
808 /* IF: it doesn't fit, use 'mtu' - the data space left */
811 /* IF: we are not sending up to and including the packet end
812 then align the next start on an eight byte boundary */
820 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
821 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
822 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
823 IPSTATS_MIB_FRAGFAILS);
829 * Set up data on packet
832 ip6_copy_metadata(frag, skb);
833 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
834 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
835 skb_reset_network_header(frag);
836 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
837 frag->transport_header = (frag->network_header + hlen +
838 sizeof(struct frag_hdr));
841 * Charge the memory for the fragment to any owner
845 skb_set_owner_w(frag, skb->sk);
848 * Copy the packet header into the new buffer.
850 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
853 * Build fragment header.
855 fh->nexthdr = nexthdr;
858 ipv6_select_ident(fh, rt);
859 frag_id = fh->identification;
861 fh->identification = frag_id;
864 * Copy a block of the IP datagram.
866 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
870 fh->frag_off = htons(offset);
872 fh->frag_off |= htons(IP6_MF);
873 ipv6_hdr(frag)->payload_len = htons(frag->len -
874 sizeof(struct ipv6hdr));
880 * Put this fragment into the sending queue.
886 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
887 IPSTATS_MIB_FRAGCREATES);
889 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
890 IPSTATS_MIB_FRAGOKS);
895 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
896 IPSTATS_MIB_FRAGFAILS);
901 static inline int ip6_rt_check(struct rt6key *rt_key,
902 struct in6_addr *fl_addr,
903 struct in6_addr *addr_cache)
905 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
906 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
909 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
910 struct dst_entry *dst,
913 struct ipv6_pinfo *np = inet6_sk(sk);
914 struct rt6_info *rt = (struct rt6_info *)dst;
919 /* Yes, checking route validity in not connected
920 * case is not very simple. Take into account,
921 * that we do not support routing by source, TOS,
922 * and MSG_DONTROUTE --ANK (980726)
924 * 1. ip6_rt_check(): If route was host route,
925 * check that cached destination is current.
926 * If it is network route, we still may
927 * check its validity using saved pointer
928 * to the last used address: daddr_cache.
929 * We do not want to save whole address now,
930 * (because main consumer of this service
931 * is tcp, which has not this problem),
932 * so that the last trick works only on connected
934 * 2. oif also should be the same.
936 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
937 #ifdef CONFIG_IPV6_SUBTREES
938 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
940 (fl->oif && fl->oif != dst->dev->ifindex)) {
949 static int ip6_dst_lookup_tail(struct sock *sk,
950 struct dst_entry **dst, struct flowi *fl)
953 struct net *net = sock_net(sk);
956 *dst = ip6_route_output(net, sk, fl);
958 if ((err = (*dst)->error))
959 goto out_err_release;
961 if (ipv6_addr_any(&fl->fl6_src)) {
962 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
964 sk ? inet6_sk(sk)->srcprefs : 0,
967 goto out_err_release;
970 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
972 * Here if the dst entry we've looked up
973 * has a neighbour entry that is in the INCOMPLETE
974 * state and the src address from the flow is
975 * marked as OPTIMISTIC, we release the found
976 * dst entry and replace it instead with the
977 * dst entry of the nexthop router
979 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
980 struct inet6_ifaddr *ifp;
984 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
987 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
993 * We need to get the dst entry for the
994 * default router instead
997 memcpy(&fl_gw, fl, sizeof(struct flowi));
998 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
999 *dst = ip6_route_output(net, sk, &fl_gw);
1000 if ((err = (*dst)->error))
1001 goto out_err_release;
1009 if (err == -ENETUNREACH)
1010 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1017 * ip6_dst_lookup - perform route lookup on flow
1018 * @sk: socket which provides route info
1019 * @dst: pointer to dst_entry * for result
1020 * @fl: flow to lookup
1022 * This function performs a route lookup on the given flow.
1024 * It returns zero on success, or a standard errno code on error.
1026 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1029 return ip6_dst_lookup_tail(sk, dst, fl);
1031 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1034 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1035 * @sk: socket which provides the dst cache and route info
1036 * @dst: pointer to dst_entry * for result
1037 * @fl: flow to lookup
1039 * This function performs a route lookup on the given flow with the
1040 * possibility of using the cached route in the socket if it is valid.
1041 * It will take the socket dst lock when operating on the dst cache.
1042 * As a result, this function can only be used in process context.
1044 * It returns zero on success, or a standard errno code on error.
1046 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1050 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1051 *dst = ip6_sk_dst_check(sk, *dst, fl);
1054 return ip6_dst_lookup_tail(sk, dst, fl);
1056 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1058 static inline int ip6_ufo_append_data(struct sock *sk,
1059 int getfrag(void *from, char *to, int offset, int len,
1060 int odd, struct sk_buff *skb),
1061 void *from, int length, int hh_len, int fragheaderlen,
1062 int transhdrlen, int mtu,unsigned int flags,
1063 struct rt6_info *rt)
1066 struct sk_buff *skb;
1069 /* There is support for UDP large send offload by network
1070 * device, so create one single skb packet containing complete
1073 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1074 skb = sock_alloc_send_skb(sk,
1075 hh_len + fragheaderlen + transhdrlen + 20,
1076 (flags & MSG_DONTWAIT), &err);
1080 /* reserve space for Hardware header */
1081 skb_reserve(skb, hh_len);
1083 /* create space for UDP/IP header */
1084 skb_put(skb,fragheaderlen + transhdrlen);
1086 /* initialize network header pointer */
1087 skb_reset_network_header(skb);
1089 /* initialize protocol header pointer */
1090 skb->transport_header = skb->network_header + fragheaderlen;
1092 skb->ip_summed = CHECKSUM_PARTIAL;
1094 sk->sk_sndmsg_off = 0;
1097 err = skb_append_datato_frags(sk,skb, getfrag, from,
1098 (length - transhdrlen));
1100 struct frag_hdr fhdr;
1102 /* Specify the length of each IPv6 datagram fragment.
1103 * It has to be a multiple of 8.
1105 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1106 sizeof(struct frag_hdr)) & ~7;
1107 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1108 ipv6_select_ident(&fhdr, rt);
1109 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1110 __skb_queue_tail(&sk->sk_write_queue, skb);
1114 /* There is not enough support to do UDP LSO,
1115 * so follow normal path
1122 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1125 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1128 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1131 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1134 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1135 int offset, int len, int odd, struct sk_buff *skb),
1136 void *from, int length, int transhdrlen,
1137 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1138 struct rt6_info *rt, unsigned int flags, int dontfrag)
1140 struct inet_sock *inet = inet_sk(sk);
1141 struct ipv6_pinfo *np = inet6_sk(sk);
1142 struct sk_buff *skb;
1143 unsigned int maxfraglen, fragheaderlen;
1150 int csummode = CHECKSUM_NONE;
1152 if (flags&MSG_PROBE)
1154 if (skb_queue_empty(&sk->sk_write_queue)) {
1159 if (WARN_ON(np->cork.opt))
1162 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1163 if (unlikely(np->cork.opt == NULL))
1166 np->cork.opt->tot_len = opt->tot_len;
1167 np->cork.opt->opt_flen = opt->opt_flen;
1168 np->cork.opt->opt_nflen = opt->opt_nflen;
1170 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1172 if (opt->dst0opt && !np->cork.opt->dst0opt)
1175 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1177 if (opt->dst1opt && !np->cork.opt->dst1opt)
1180 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1182 if (opt->hopopt && !np->cork.opt->hopopt)
1185 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1187 if (opt->srcrt && !np->cork.opt->srcrt)
1190 /* need source address above miyazawa*/
1193 inet->cork.dst = &rt->dst;
1194 inet->cork.fl = *fl;
1195 np->cork.hop_limit = hlimit;
1196 np->cork.tclass = tclass;
1197 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1198 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1199 if (np->frag_size < mtu) {
1201 mtu = np->frag_size;
1203 inet->cork.fragsize = mtu;
1204 if (dst_allfrag(rt->dst.path))
1205 inet->cork.flags |= IPCORK_ALLFRAG;
1206 inet->cork.length = 0;
1207 sk->sk_sndmsg_page = NULL;
1208 sk->sk_sndmsg_off = 0;
1209 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1210 rt->rt6i_nfheader_len;
1211 length += exthdrlen;
1212 transhdrlen += exthdrlen;
1214 rt = (struct rt6_info *)inet->cork.dst;
1215 fl = &inet->cork.fl;
1219 mtu = inet->cork.fragsize;
1222 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1224 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1225 (opt ? opt->opt_nflen : 0);
1226 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1228 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1229 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1230 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1236 * Let's try using as much space as possible.
1237 * Use MTU if total length of the message fits into the MTU.
1238 * Otherwise, we need to reserve fragment header and
1239 * fragment alignment (= 8-15 octets, in total).
1241 * Note that we may need to "move" the data from the tail of
1242 * the buffer to the new fragment when we split
1245 * FIXME: It may be fragmented into multiple chunks
1246 * at once if non-fragmentable extension headers
1251 inet->cork.length += length;
1253 int proto = sk->sk_protocol;
1254 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1255 ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
1259 if (proto == IPPROTO_UDP &&
1260 (rt->dst.dev->features & NETIF_F_UFO)) {
1262 err = ip6_ufo_append_data(sk, getfrag, from, length,
1263 hh_len, fragheaderlen,
1264 transhdrlen, mtu, flags, rt);
1271 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1274 while (length > 0) {
1275 /* Check if the remaining data fits into current packet. */
1276 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1278 copy = maxfraglen - skb->len;
1282 unsigned int datalen;
1283 unsigned int fraglen;
1284 unsigned int fraggap;
1285 unsigned int alloclen;
1286 struct sk_buff *skb_prev;
1290 /* There's no room in the current skb */
1292 fraggap = skb_prev->len - maxfraglen;
1297 * If remaining data exceeds the mtu,
1298 * we know we need more fragment(s).
1300 datalen = length + fraggap;
1301 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1302 datalen = maxfraglen - fragheaderlen;
1304 fraglen = datalen + fragheaderlen;
1305 if ((flags & MSG_MORE) &&
1306 !(rt->dst.dev->features&NETIF_F_SG))
1309 alloclen = datalen + fragheaderlen;
1312 * The last fragment gets additional space at tail.
1313 * Note: we overallocate on fragments with MSG_MORE
1314 * because we have no idea if we're the last one.
1316 if (datalen == length + fraggap)
1317 alloclen += rt->dst.trailer_len;
1320 * We just reserve space for fragment header.
1321 * Note: this may be overallocation if the message
1322 * (without MSG_MORE) fits into the MTU.
1324 alloclen += sizeof(struct frag_hdr);
1327 skb = sock_alloc_send_skb(sk,
1329 (flags & MSG_DONTWAIT), &err);
1332 if (atomic_read(&sk->sk_wmem_alloc) <=
1334 skb = sock_wmalloc(sk,
1335 alloclen + hh_len, 1,
1337 if (unlikely(skb == NULL))
1343 * Fill in the control structures
1345 skb->ip_summed = csummode;
1347 /* reserve for fragmentation */
1348 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1351 * Find where to start putting bytes
1353 data = skb_put(skb, fraglen);
1354 skb_set_network_header(skb, exthdrlen);
1355 data += fragheaderlen;
1356 skb->transport_header = (skb->network_header +
1359 skb->csum = skb_copy_and_csum_bits(
1360 skb_prev, maxfraglen,
1361 data + transhdrlen, fraggap, 0);
1362 skb_prev->csum = csum_sub(skb_prev->csum,
1365 pskb_trim_unique(skb_prev, maxfraglen);
1367 copy = datalen - transhdrlen - fraggap;
1372 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1379 length -= datalen - fraggap;
1382 csummode = CHECKSUM_NONE;
1385 * Put the packet on the pending queue
1387 __skb_queue_tail(&sk->sk_write_queue, skb);
1394 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1398 if (getfrag(from, skb_put(skb, copy),
1399 offset, copy, off, skb) < 0) {
1400 __skb_trim(skb, off);
1405 int i = skb_shinfo(skb)->nr_frags;
1406 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1407 struct page *page = sk->sk_sndmsg_page;
1408 int off = sk->sk_sndmsg_off;
1411 if (page && (left = PAGE_SIZE - off) > 0) {
1414 if (page != frag->page) {
1415 if (i == MAX_SKB_FRAGS) {
1420 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1421 frag = &skb_shinfo(skb)->frags[i];
1423 } else if(i < MAX_SKB_FRAGS) {
1424 if (copy > PAGE_SIZE)
1426 page = alloc_pages(sk->sk_allocation, 0);
1431 sk->sk_sndmsg_page = page;
1432 sk->sk_sndmsg_off = 0;
1434 skb_fill_page_desc(skb, i, page, 0, 0);
1435 frag = &skb_shinfo(skb)->frags[i];
1440 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1444 sk->sk_sndmsg_off += copy;
1447 skb->data_len += copy;
1448 skb->truesize += copy;
1449 atomic_add(copy, &sk->sk_wmem_alloc);
1456 inet->cork.length -= length;
1457 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1461 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1464 kfree(np->cork.opt->dst0opt);
1465 kfree(np->cork.opt->dst1opt);
1466 kfree(np->cork.opt->hopopt);
1467 kfree(np->cork.opt->srcrt);
1468 kfree(np->cork.opt);
1469 np->cork.opt = NULL;
1472 if (inet->cork.dst) {
1473 dst_release(inet->cork.dst);
1474 inet->cork.dst = NULL;
1475 inet->cork.flags &= ~IPCORK_ALLFRAG;
1477 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1480 int ip6_push_pending_frames(struct sock *sk)
1482 struct sk_buff *skb, *tmp_skb;
1483 struct sk_buff **tail_skb;
1484 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1485 struct inet_sock *inet = inet_sk(sk);
1486 struct ipv6_pinfo *np = inet6_sk(sk);
1487 struct net *net = sock_net(sk);
1488 struct ipv6hdr *hdr;
1489 struct ipv6_txoptions *opt = np->cork.opt;
1490 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1491 struct flowi *fl = &inet->cork.fl;
1492 unsigned char proto = fl->proto;
1495 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1497 tail_skb = &(skb_shinfo(skb)->frag_list);
1499 /* move skb->data to ip header from ext header */
1500 if (skb->data < skb_network_header(skb))
1501 __skb_pull(skb, skb_network_offset(skb));
1502 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1503 __skb_pull(tmp_skb, skb_network_header_len(skb));
1504 *tail_skb = tmp_skb;
1505 tail_skb = &(tmp_skb->next);
1506 skb->len += tmp_skb->len;
1507 skb->data_len += tmp_skb->len;
1508 skb->truesize += tmp_skb->truesize;
1509 tmp_skb->destructor = NULL;
1513 /* Allow local fragmentation. */
1514 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1517 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1518 __skb_pull(skb, skb_network_header_len(skb));
1519 if (opt && opt->opt_flen)
1520 ipv6_push_frag_opts(skb, opt, &proto);
1521 if (opt && opt->opt_nflen)
1522 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1524 skb_push(skb, sizeof(struct ipv6hdr));
1525 skb_reset_network_header(skb);
1526 hdr = ipv6_hdr(skb);
1528 *(__be32*)hdr = fl->fl6_flowlabel |
1529 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1531 hdr->hop_limit = np->cork.hop_limit;
1532 hdr->nexthdr = proto;
1533 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1534 ipv6_addr_copy(&hdr->daddr, final_dst);
1536 skb->priority = sk->sk_priority;
1537 skb->mark = sk->sk_mark;
1539 skb_dst_set(skb, dst_clone(&rt->dst));
1540 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1541 if (proto == IPPROTO_ICMPV6) {
1542 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1544 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1545 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1548 err = ip6_local_out(skb);
1551 err = net_xmit_errno(err);
1557 ip6_cork_release(inet, np);
1560 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1564 void ip6_flush_pending_frames(struct sock *sk)
1566 struct sk_buff *skb;
1568 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1570 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1571 IPSTATS_MIB_OUTDISCARDS);
1575 ip6_cork_release(inet_sk(sk), inet6_sk(sk));