2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : airthmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * __ip6_local_out - finalize a locally generated IPv6 packet and pass it
 * to the NF_INET_LOCAL_OUT netfilter hook, with dst_output() as the
 * okfn continuation.  Writes the 16-bit payload_len field from skb->len.
 * NOTE(review): several lines are elided in this excerpt (braces, the
 * `len` declaration, and the body of the IPV6_MAXPLEN branch).
 */
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
/* payload_len is only 16 bits; oversized payloads must not be stored
 * as-is -- presumably len is clamped to 0 on the elided branch (jumbo
 * payload convention).  TODO confirm against the full source. */
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
/*
 * ip6_local_out - public wrapper: run __ip6_local_out() (netfilter
 * LOCAL_OUT) and, on the elided success branch, hand the packet to
 * dst_output() for transmission.  Exported for GPL modules.
 */
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
/* NOTE(review): the condition guarding this call is elided here;
 * presumably it only runs when err == 1 (hook passed). */
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
86 /* dev_loopback_xmit for use with netfilter. */
/*
 * Loop a multicast copy back to the local stack: reset the MAC header,
 * strip everything before the network header, and mark the skb as a
 * loopback packet whose checksum needs no re-verification.
 * NOTE(review): the tail of the function (netif_rx / return) is elided.
 */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
/* Copy was produced locally; no need to checksum it again on receive. */
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
/*
 * ip6_finish_output2 - final transmit step: for multicast destinations,
 * optionally clone the packet and loop it back to local listeners via the
 * POST_ROUTING hook before sending; then emit through the neighbour layer
 * (cached hardware header or neighbour output function).  Falls through to
 * an OUTNOROUTES discard when no neighbour is available.
 * NOTE(review): braces and several statements are elided in this excerpt.
 */
99 static int ip6_finish_output2(struct sk_buff *skb)
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
104 skb->protocol = htons(ETH_P_IPV6);
107 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
108 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
/* Loop back to local sockets when: not a loopback device, the sending
 * socket wants multicast loopback, and either a multicast-router socket
 * exists (packet not already forwarded) or a local listener is joined
 * to this group. */
110 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
111 ((mroute6_socket(dev_net(dev)) &&
112 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
113 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
114 &ipv6_hdr(skb)->saddr))) {
115 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117 /* Do not check for IFF_ALLMULTI; multicast routing
118 is not supported in any case.
/* NOTE(review): the NULL check on newskb is elided here. */
121 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
122 newskb, NULL, newskb->dev,
123 ip6_dev_loopback_xmit);
/* hop_limit 0 means the packet must not leave the node at all. */
125 if (ipv6_hdr(skb)->hop_limit == 0) {
126 IP6_INC_STATS(dev_net(dev), idev,
127 IPSTATS_MIB_OUTDISCARDS);
133 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
/* Fast path: cached hardware header if present, else slow neighbour
 * resolution; otherwise count a no-route discard below. */
138 return neigh_hh_output(dst->hh, skb);
139 else if (dst->neighbour)
140 return dst->neighbour->output(skb);
142 IP6_INC_STATS_BH(dev_net(dst->dev),
143 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * ip6_skb_dst_mtu - MTU to apply to this skb's route: when the owning
 * socket uses IPV6_PMTUDISC_PROBE, use the raw device MTU (ignore the
 * learned path MTU); otherwise use the dst's path MTU.
 */
148 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
150 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
152 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
153 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
/*
 * ip6_finish_output - fragment when the packet exceeds the route MTU
 * (and is not GSO, which fragments later in hardware/software), or when
 * the route demands fragmentation of every packet (dst_allfrag);
 * otherwise transmit directly via ip6_finish_output2().
 */
156 static int ip6_finish_output(struct sk_buff *skb)
158 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
159 dst_allfrag(skb_dst(skb)))
160 return ip6_fragment(skb, ip6_finish_output2);
162 return ip6_finish_output2(skb);
/*
 * ip6_output - dst_output entry point for IPv6: drop (and count
 * OUTDISCARDS) if IPv6 is administratively disabled on the egress
 * device, otherwise run the POST_ROUTING hook and continue with
 * ip6_finish_output, skipping the hook for rerouted packets.
 * NOTE(review): the kfree_skb/return on the disabled branch is elided.
 */
165 int ip6_output(struct sk_buff *skb)
167 struct net_device *dev = skb_dst(skb)->dev;
168 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
169 if (unlikely(idev->cnf.disable_ipv6)) {
170 IP6_INC_STATS(dev_net(dev), idev,
171 IPSTATS_MIB_OUTDISCARDS);
/* Conditional hook: IP6SKB_REROUTED packets already traversed it. */
176 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
178 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
182 * xmit an sk_buff (used by TCP, SCTP and DCCP)
/*
 * ip6_xmit - build and send a fully-formed IPv6 packet for a connected
 * socket: ensure headroom for extension headers + IPv6 header + link
 * layer, push any txoptions, fill the IPv6 header from the flow, then
 * pass it to LOCAL_OUT if it fits the MTU; otherwise signal PKT_TOOBIG
 * to ourselves and fail.
 * NOTE(review): declarations of hdr/hlimit/tclass/mtu and several
 * branches are elided in this excerpt.
 */
185 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
186 struct ipv6_txoptions *opt)
188 struct net *net = sock_net(sk);
189 struct ipv6_pinfo *np = inet6_sk(sk);
190 struct in6_addr *first_hop = &fl->fl6_dst;
191 struct dst_entry *dst = skb_dst(skb);
193 u8 proto = fl->proto;
194 int seg_len = skb->len;
200 unsigned int head_room;
202 /* First: exthdrs may take lots of space (~8K for now)
203 MAX_HEADER is not enough.
205 head_room = opt->opt_nflen + opt->opt_flen;
206 seg_len += head_room;
207 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
/* Not enough headroom: reallocate; on failure count a discard.
 * NOTE(review): the skb2 NULL check and skb substitution are elided. */
209 if (skb_headroom(skb) < head_room) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
212 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
213 IPSTATS_MIB_OUTDISCARDS);
220 skb_set_owner_w(skb, sk);
/* Push fragmentable then non-fragmentable extension headers; the
 * latter may rewrite the first hop (routing header). */
223 ipv6_push_frag_opts(skb, opt, &proto);
225 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228 skb_push(skb, sizeof(struct ipv6hdr));
229 skb_reset_network_header(skb);
233 * Fill in the IPv6 header
/* Hop limit from the socket, else derived from the route. */
237 hlimit = np->hop_limit;
240 hlimit = ip6_dst_hoplimit(dst);
/* Version 6 + traffic class + flow label packed into the first word. */
242 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
244 hdr->payload_len = htons(seg_len);
245 hdr->nexthdr = proto;
246 hdr->hop_limit = hlimit;
248 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
249 ipv6_addr_copy(&hdr->daddr, first_hop);
251 skb->priority = sk->sk_priority;
252 skb->mark = sk->sk_mark;
/* Fits the path MTU, local fragmentation allowed, or GSO: send. */
255 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
256 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
257 IPSTATS_MIB_OUT, skb->len);
258 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
259 dst->dev, dst_output);
/* Too big and DF semantics apply: tell our own stack (TCP will
 * shrink its MSS) and drop. */
263 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
265 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
266 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
271 EXPORT_SYMBOL(ip6_xmit);
274 * To avoid extra problems ND packets are send through this
275 * routine. It's code duplication but I really want to avoid
276 * extra checks since ipv6_build_header is used by TCP (which
277 * is for us performance critical)
/*
 * ip6_nd_hdr - build a bare IPv6 header for neighbour-discovery
 * packets: no extension headers, traffic class/flow label zero,
 * hop limit taken from the socket.
 * NOTE(review): the hdr/totlen declarations and parts of the
 * signature (len/proto parameters) are elided in this excerpt.
 */
280 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
281 const struct in6_addr *saddr, const struct in6_addr *daddr,
284 struct ipv6_pinfo *np = inet6_sk(sk);
288 skb->protocol = htons(ETH_P_IPV6);
291 totlen = len + sizeof(struct ipv6hdr);
293 skb_reset_network_header(skb);
294 skb_put(skb, sizeof(struct ipv6hdr));
/* First word: version 6, tclass 0, flow label 0. */
297 *(__be32*)hdr = htonl(0x60000000);
299 hdr->payload_len = htons(len);
300 hdr->nexthdr = proto;
301 hdr->hop_limit = np->hop_limit;
303 ipv6_addr_copy(&hdr->saddr, saddr);
304 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a Router-Alert packet to every raw socket
 * registered (via IPV6_ROUTER_ALERT) for this alert value `sel`,
 * honouring device binding.  Clones for all but the last matching
 * socket, which consumes the original skb.
 * NOTE(review): the clone NULL check, `last = sk` bookkeeping and
 * return statements are elided in this excerpt.
 */
309 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
311 struct ip6_ra_chain *ra;
312 struct sock *last = NULL;
314 read_lock(&ip6_ra_lock);
315 for (ra = ip6_ra_chain; ra; ra = ra->next) {
316 struct sock *sk = ra->sk;
317 if (sk && ra->sel == sel &&
318 (!sk->sk_bound_dev_if ||
319 sk->sk_bound_dev_if == skb->dev->ifindex)) {
/* Previous match gets a clone; the final match (below) gets the
 * original so we avoid one extra copy. */
321 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
323 rawv6_rcv(last, skb2);
330 rawv6_rcv(last, skb);
331 read_unlock(&ip6_ra_lock);
334 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - decide how to treat a packet destined to an
 * address we proxy (proxy_ndp): ND messages are passed up to the local
 * stack, link-local destinations are rejected with a link failure, and
 * (on elided paths) other traffic is forwarded.
 * NOTE(review): the `offset` declaration, several returns, and the
 * negative-offset handling are elided in this excerpt.
 */
338 static int ip6_forward_proxy_check(struct sk_buff *skb)
340 struct ipv6hdr *hdr = ipv6_hdr(skb);
341 u8 nexthdr = hdr->nexthdr;
/* Skip any extension headers to find the transport protocol. */
344 if (ipv6_ext_hdr(nexthdr)) {
345 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
349 offset = sizeof(struct ipv6hdr);
351 if (nexthdr == IPPROTO_ICMPV6) {
352 struct icmp6hdr *icmp6;
/* Make sure at least the ICMPv6 type byte is in linear data. */
354 if (!pskb_may_pull(skb, (skb_network_header(skb) +
355 offset + 1 - skb->data)))
358 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
360 switch (icmp6->icmp6_type) {
361 case NDISC_ROUTER_SOLICITATION:
362 case NDISC_ROUTER_ADVERTISEMENT:
363 case NDISC_NEIGHBOUR_SOLICITATION:
364 case NDISC_NEIGHBOUR_ADVERTISEMENT:
366 /* For reaction involving unicast neighbor discovery
367 * message destined to the proxied address, pass it to
377 * The proxying router can't forward traffic sent to a link-local
378 * address, so signal the sender and discard the packet. This
379 * behavior is clarified by the MIPv6 specification.
381 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
382 dst_link_failure(skb);
/* ip6_forward_finish - okfn for the FORWARD hook: just hand the packet
 * on to dst_output() for transmission on the egress route. */
389 static inline int ip6_forward_finish(struct sk_buff *skb)
391 return dst_output(skb);
/*
 * ip6_forward - IPv6 forwarding path.  In order: global forwarding
 * enabled check, LRO rejection, XFRM forward-policy check, Router-Alert
 * delivery, hop-limit check (ICMP time-exceeded on expiry), proxy-NDP
 * handling, XFRM route check, redirect generation / source-address
 * sanity checks, MTU check (ICMP packet-too-big), skb_cow, hop-limit
 * decrement, and finally the NF_INET_FORWARD hook.
 * NOTE(review): many lines (gotos, kfree_skb, the mtu assignment, the
 * hop_limit decrement, error labels) are elided in this excerpt.
 */
394 int ip6_forward(struct sk_buff *skb)
396 struct dst_entry *dst = skb_dst(skb);
397 struct ipv6hdr *hdr = ipv6_hdr(skb);
398 struct inet6_skb_parm *opt = IP6CB(skb);
399 struct net *net = dev_net(dst->dev);
402 if (net->ipv6.devconf_all->forwarding == 0)
/* Large-receive-offload skbs must never be forwarded. */
405 if (skb_warn_if_lro(skb))
408 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
409 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
413 skb_forward_csum(skb);
416 * We DO NOT make any processing on
417 * RA packets, pushing them to user level AS IS
418 * without ane WARRANTY that application will be able
419 * to interpret them. The reason is that we
420 * cannot make anything clever here.
422 * We are not end-node, so that if packet contains
423 * AH/ESP, we cannot make anything.
424 * Defragmentation also would be mistake, RA packets
425 * cannot be fragmented, because there is no warranty
426 * that different fragments will go along one path. --ANK
429 u8 *ptr = skb_network_header(skb) + opt->ra;
/* Router-Alert value lives in the option payload at ptr[2..3]. */
430 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
435 * check and decrement ttl
437 if (hdr->hop_limit <= 1) {
438 /* Force OUTPUT device used as source address */
440 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
441 IP6_INC_STATS_BH(net,
442 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
448 /* XXX: idev->cnf.proxy_ndp? */
449 if (net->ipv6.devconf_all->proxy_ndp &&
450 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
451 int proxied = ip6_forward_proxy_check(skb);
/* proxied > 0: deliver locally; proxied < 0: drop. */
453 return ip6_input(skb);
454 else if (proxied < 0) {
455 IP6_INC_STATS(net, ip6_dst_idev(dst),
456 IPSTATS_MIB_INDISCARDS);
461 if (!xfrm6_route_forward(skb)) {
462 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
467 /* IPv6 specs say nothing about it, but it is clear that we cannot
468 send redirects to source routed frames.
469 We don't send redirects to frames decapsulated from IPsec.
471 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
472 !skb_sec_path(skb)) {
473 struct in6_addr *target = NULL;
475 struct neighbour *n = dst->neighbour;
478 * incoming and outgoing devices are the same
/* Redirect target: the gateway for indirect routes, otherwise the
 * final destination itself. */
482 rt = (struct rt6_info *) dst;
483 if ((rt->rt6i_flags & RTF_GATEWAY))
484 target = (struct in6_addr*)&n->primary_key;
486 target = &hdr->daddr;
488 /* Limit redirects both by destination (here)
489 and by source (inside ndisc_send_redirect)
491 if (xrlim_allow(dst, 1*HZ))
492 ndisc_send_redirect(skb, n, target);
494 int addrtype = ipv6_addr_type(&hdr->saddr);
496 /* This check is security critical. */
497 if (addrtype == IPV6_ADDR_ANY ||
498 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
500 if (addrtype & IPV6_ADDR_LINKLOCAL) {
501 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
502 ICMPV6_NOT_NEIGHBOUR, 0);
/* Never advertise an MTU below the IPv6 minimum of 1280. */
508 if (mtu < IPV6_MIN_MTU)
511 if (skb->len > mtu) {
512 /* Again, force OUTPUT device used as source address */
514 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
515 IP6_INC_STATS_BH(net,
516 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
517 IP6_INC_STATS_BH(net,
518 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
/* Private writable copy of the headers before mangling hop_limit. */
523 if (skb_cow(skb, dst->dev->hard_header_len)) {
524 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
530 /* Mangling hops number delayed to point after skb COW */
534 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
535 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
539 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-skb bookkeeping (packet type, priority,
 * protocol, route reference, mark, traffic-control index, netfilter
 * trace flag, secmark) from an original skb to a fragment of it.
 */
545 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
547 to->pkt_type = from->pkt_type;
548 to->priority = from->priority;
549 to->protocol = from->protocol;
/* Fragment holds its own reference to the shared route. */
551 skb_dst_set(to, dst_clone(skb_dst(from)));
553 to->mark = from->mark;
555 #ifdef CONFIG_NET_SCHED
556 to->tc_index = from->tc_index;
559 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
560 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
561 to->nf_trace = from->nf_trace;
563 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the extension-header chain to find where the
 * fragment header must be inserted (the unfragmentable part ends before
 * the first routing header without a HAO, per MIPv6 handling below).
 * Returns the byte offset from the network header; *nexthdr is left
 * pointing at the nexthdr field to patch with NEXTHDR_FRAGMENT.
 * NOTE(review): the switch head, other case labels and break/return
 * statements are elided in this excerpt.
 */
566 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
568 u16 offset = sizeof(struct ipv6hdr);
569 struct ipv6_opt_hdr *exthdr =
570 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
571 unsigned int packet_len = skb->tail - skb->network_header;
573 *nexthdr = &ipv6_hdr(skb)->nexthdr;
/* Need at least the 2-byte {nexthdr, hdrlen} prefix of each header. */
575 while (offset + 1 <= packet_len) {
581 case NEXTHDR_ROUTING:
585 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
/* A Home Address Option keeps scanning past this header (MIPv6). */
586 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
596 offset += ipv6_optlen(exthdr);
597 *nexthdr = &exthdr->nexthdr;
598 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * ip6_fragment - split an oversized skb into MTU-sized IPv6 fragments
 * and emit each through output().  Two strategies:
 *   fast path - the skb already carries a well-formed frag_list: reuse
 *     the existing buffers, inserting a fragment header into each;
 *   slow path - allocate fresh skbs and copy the data in MTU-sized
 *     slices.
 * Non-locally-generated packets (local_df clear) are never fragmented
 * here; they get an ICMPV6_PKT_TOOBIG instead.
 * NOTE(review): a large number of lines (declarations of fh/frag_id/
 * first_len/truesizes, loop heads, error labels, gotos, output() calls)
 * are elided in this excerpt.
 */
605 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
607 struct sk_buff *frag;
608 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
609 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
610 struct ipv6hdr *tmp_hdr;
612 unsigned int mtu, hlen, left, len;
614 int ptr, offset = 0, err=0;
615 u8 *prevhdr, nexthdr = 0;
616 struct net *net = dev_net(skb_dst(skb)->dev);
618 hlen = ip6_find_1stfragopt(skb, &prevhdr);
621 mtu = ip6_skb_dst_mtu(skb);
623 /* We must not fragment if the socket is set to force MTU discovery
624 * or if the skb it not generated by a local socket.
626 if (!skb->local_df) {
627 skb->dev = skb_dst(skb)->dev;
628 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
629 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
630 IPSTATS_MIB_FRAGFAILS);
/* Honour a socket-requested fragment size smaller than the path MTU. */
635 if (np && np->frag_size < mtu) {
/* From here on, mtu is the per-fragment payload budget. */
639 mtu -= hlen + sizeof(struct frag_hdr);
/* ---- fast path: skb already has a frag_list ---- */
641 if (skb_has_frags(skb)) {
642 int first_len = skb_pagelen(skb);
/* The existing geometry must already match fragment rules: every
 * piece <= mtu, all but the last a multiple of 8 bytes, and enough
 * headroom for the headers we are about to push. */
645 if (first_len - hlen > mtu ||
646 ((first_len - hlen) & 7) ||
650 skb_walk_frags(skb, frag) {
651 /* Correct geometry. */
652 if (frag->len > mtu ||
653 ((frag->len & 7) && frag->next) ||
654 skb_headroom(frag) < hlen)
657 /* Partially cloned skb? */
658 if (skb_shared(frag))
/* Transfer truesize accounting from the parent skb to the
 * individual fragments so socket memory stays balanced. */
664 frag->destructor = sock_wfree;
665 truesizes += frag->truesize;
671 frag = skb_shinfo(skb)->frag_list;
672 skb_frag_list_init(skb);
675 *prevhdr = NEXTHDR_FRAGMENT;
/* Keep a copy of the unfragmentable header part to replicate into
 * every fragment. */
676 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
678 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
679 IPSTATS_MIB_FRAGFAILS);
/* Insert the fragment header between the unfragmentable part and
 * the payload of the first fragment. */
683 __skb_pull(skb, hlen);
684 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
685 __skb_push(skb, hlen);
686 skb_reset_network_header(skb);
687 memcpy(skb_network_header(skb), tmp_hdr, hlen);
689 ipv6_select_ident(fh);
690 fh->nexthdr = nexthdr;
692 fh->frag_off = htons(IP6_MF);
693 frag_id = fh->identification;
695 first_len = skb_pagelen(skb);
696 skb->data_len = first_len - skb_headlen(skb);
697 skb->truesize -= truesizes;
698 skb->len = first_len;
699 ipv6_hdr(skb)->payload_len = htons(first_len -
700 sizeof(struct ipv6hdr));
702 dst_hold(&rt->u.dst);
705 /* Prepare header of the next frame,
706 * before previous one went down. */
708 frag->ip_summed = CHECKSUM_NONE;
709 skb_reset_transport_header(frag);
710 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
711 __skb_push(frag, hlen);
712 skb_reset_network_header(frag);
713 memcpy(skb_network_header(frag), tmp_hdr,
/* Running fragment offset; MF set on all but the last fragment. */
715 offset += skb->len - hlen - sizeof(struct frag_hdr);
716 fh->nexthdr = nexthdr;
718 fh->frag_off = htons(offset);
719 if (frag->next != NULL)
720 fh->frag_off |= htons(IP6_MF);
721 fh->identification = frag_id;
722 ipv6_hdr(frag)->payload_len =
724 sizeof(struct ipv6hdr));
725 ip6_copy_metadata(frag, skb);
730 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
731 IPSTATS_MIB_FRAGCREATES);
744 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
745 IPSTATS_MIB_FRAGOKS);
746 dst_release(&rt->u.dst);
756 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
757 IPSTATS_MIB_FRAGFAILS);
758 dst_release(&rt->u.dst);
/* ---- slow path: allocate and copy each fragment ---- */
763 left = skb->len - hlen; /* Space per frame */
764 ptr = hlen; /* Where to start from */
767 * Fragment the datagram.
770 *prevhdr = NEXTHDR_FRAGMENT;
773 * Keep copying data until we run out.
777 /* IF: it doesn't fit, use 'mtu' - the data space left */
780 /* IF: we are not sending upto and including the packet end
781 then align the next start on an eight byte boundary */
789 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
790 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
791 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
792 IPSTATS_MIB_FRAGFAILS);
798 * Set up data on packet
801 ip6_copy_metadata(frag, skb);
802 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
803 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
804 skb_reset_network_header(frag);
805 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
806 frag->transport_header = (frag->network_header + hlen +
807 sizeof(struct frag_hdr));
810 * Charge the memory for the fragment to any owner
814 skb_set_owner_w(frag, skb->sk);
817 * Copy the packet header into the new buffer.
819 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
822 * Build fragment header.
824 fh->nexthdr = nexthdr;
/* Identification chosen once (first fragment) and reused. */
827 ipv6_select_ident(fh);
828 frag_id = fh->identification;
830 fh->identification = frag_id;
833 * Copy a block of the IP datagram.
835 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
839 fh->frag_off = htons(offset);
841 fh->frag_off |= htons(IP6_MF);
842 ipv6_hdr(frag)->payload_len = htons(frag->len -
843 sizeof(struct ipv6hdr));
849 * Put this fragment into the sending queue.
855 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
856 IPSTATS_MIB_FRAGCREATES);
858 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
859 IPSTATS_MIB_FRAGOKS);
864 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
865 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - return nonzero (stale) when the cached route key does
 * not cover the flow address: neither an exact /128 host-route match
 * nor a match against the socket's cached last-used address.
 */
870 static inline int ip6_rt_check(struct rt6key *rt_key,
871 struct in6_addr *fl_addr,
872 struct in6_addr *addr_cache)
874 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
875 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * ip6_sk_dst_check - validate a socket-cached dst against the flow:
 * release it (and return NULL on the elided path) when destination,
 * source (with subtrees) or output interface no longer match; otherwise
 * the cached dst may be reused.
 * NOTE(review): the NULL-dst early return and the release/return tail
 * are elided in this excerpt.
 */
878 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
879 struct dst_entry *dst,
882 struct ipv6_pinfo *np = inet6_sk(sk);
883 struct rt6_info *rt = (struct rt6_info *)dst;
888 /* Yes, checking route validity in not connected
889 * case is not very simple. Take into account,
890 * that we do not support routing by source, TOS,
891 * and MSG_DONTROUTE --ANK (980726)
893 * 1. ip6_rt_check(): If route was host route,
894 * check that cached destination is current.
895 * If it is network route, we still may
896 * check its validity using saved pointer
897 * to the last used address: daddr_cache.
898 * We do not want to save whole address now,
899 * (because main consumer of this service
900 * is tcp, which has not this problem),
901 * so that the last trick works only on connected
903 * 2. oif also should be the same.
905 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
906 #ifdef CONFIG_IPV6_SUBTREES
907 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
909 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - core route lookup: resolve the flow to a dst,
 * pick a source address if the flow has none, and (with optimistic DAD)
 * re-route via the default router when the next hop's neighbour entry
 * is not yet valid for an optimistic source address.
 * Returns 0 on success or a negative errno; on error the dst is
 * released on the elided out_err_release path.
 */
918 static int ip6_dst_lookup_tail(struct sock *sk,
919 struct dst_entry **dst, struct flowi *fl)
922 struct net *net = sock_net(sk);
925 *dst = ip6_route_output(net, sk, fl);
927 if ((err = (*dst)->error))
928 goto out_err_release;
/* Unspecified source: select one appropriate for this route. */
930 if (ipv6_addr_any(&fl->fl6_src)) {
931 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
933 sk ? inet6_sk(sk)->srcprefs : 0,
936 goto out_err_release;
939 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
941 * Here if the dst entry we've looked up
942 * has a neighbour entry that is in the INCOMPLETE
943 * state and the src address from the flow is
944 * marked as OPTIMISTIC, we release the found
945 * dst entry and replace it instead with the
946 * dst entry of the nexthop router
948 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
949 struct inet6_ifaddr *ifp;
953 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
956 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
962 * We need to get the dst entry for the
963 * default router instead
/* Lookup with an unspecified destination yields the default route. */
966 memcpy(&fl_gw, fl, sizeof(struct flowi));
967 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
968 *dst = ip6_route_output(net, sk, &fl_gw);
969 if ((err = (*dst)->error))
970 goto out_err_release;
978 if (err == -ENETUNREACH)
979 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
986 * ip6_dst_lookup - perform route lookup on flow
987 * @sk: socket which provides route info
988 * @dst: pointer to dst_entry * for result
989 * @fl: flow to lookup
991 * This function performs a route lookup on the given flow.
993 * It returns zero on success, or a standard errno code on error.
/* Thin public wrapper: no socket dst-cache involvement, just the
 * core lookup. */
995 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
998 return ip6_dst_lookup_tail(sk, dst, fl);
1000 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1003 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1004 * @sk: socket which provides the dst cache and route info
1005 * @dst: pointer to dst_entry * for result
1006 * @fl: flow to lookup
1008 * This function performs a route lookup on the given flow with the
1009 * possibility of using the cached route in the socket if it is valid.
1010 * It will take the socket dst lock when operating on the dst cache.
1011 * As a result, this function can only be used in process context.
1013 * It returns zero on success, or a standard errno code on error.
1015 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
/* Try the socket's cached dst first; ip6_sk_dst_check() discards it
 * if it no longer matches the flow, in which case the tail lookup
 * resolves a fresh route. */
1019 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1020 *dst = ip6_sk_dst_check(sk, *dst, fl);
1023 return ip6_dst_lookup_tail(sk, dst, fl);
1025 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * ip6_ufo_append_data - UDP fragmentation offload path for
 * ip6_append_data(): build (or extend) one large skb whose page frags
 * hold the whole datagram, set gso_size/gso_type so the device (or GSO
 * software fallback) segments it, and pre-select the IPv6 fragment id.
 * NOTE(review): error-return lines and the function tail are elided in
 * this excerpt.
 */
1027 static inline int ip6_ufo_append_data(struct sock *sk,
1028 int getfrag(void *from, char *to, int offset, int len,
1029 int odd, struct sk_buff *skb),
1030 void *from, int length, int hh_len, int fragheaderlen,
1031 int transhdrlen, int mtu,unsigned int flags)
1034 struct sk_buff *skb;
1037 /* There is support for UDP large send offload by network
1038 * device, so create one single skb packet containing complete
/* First call for this cork: allocate the head skb and lay out the
 * hardware/network/transport header areas. */
1041 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1042 skb = sock_alloc_send_skb(sk,
1043 hh_len + fragheaderlen + transhdrlen + 20,
1044 (flags & MSG_DONTWAIT), &err);
1048 /* reserve space for Hardware header */
1049 skb_reserve(skb, hh_len);
1051 /* create space for UDP/IP header */
1052 skb_put(skb,fragheaderlen + transhdrlen);
1054 /* initialize network header pointer */
1055 skb_reset_network_header(skb);
1057 /* initialize protocol header pointer */
1058 skb->transport_header = skb->network_header + fragheaderlen;
/* Checksum finished by the device at segmentation time. */
1060 skb->ip_summed = CHECKSUM_PARTIAL;
1062 sk->sk_sndmsg_off = 0;
1065 err = skb_append_datato_frags(sk,skb, getfrag, from,
1066 (length - transhdrlen));
1068 struct frag_hdr fhdr;
1070 /* Specify the length of each IPv6 datagram fragment.
1071 * It has to be a multiple of 8.
1073 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1074 sizeof(struct frag_hdr)) & ~7;
1075 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
/* All resulting fragments must share one identification value. */
1076 ipv6_select_ident(&fhdr);
1077 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1078 __skb_queue_tail(&sk->sk_write_queue, skb);
1082 /* There is not enough support do UPD LSO,
1083 * so follow normal path
/* Duplicate an IPv6 option header (or return NULL for NULL input);
 * hdrlen is in units of 8 octets, not counting the first 8.
 * NOTE(review): the gfp parameter line is elided in this excerpt. */
1090 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1093 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* Duplicate an IPv6 routing header, same length convention as
 * ip6_opt_dup(); returns NULL for NULL input.
 * NOTE(review): the gfp parameter line is elided in this excerpt. */
1096 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1099 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_append_data - corked-send workhorse: append `length` bytes
 * (pulled via getfrag) to the socket's write queue as one or more
 * MTU-sized skbs, to be finalized later by ip6_push_pending_frames().
 * On the first call for a cork it duplicates txoptions, pins the route,
 * and records hop limit / tclass / fragment size in the cork state;
 * subsequent calls reuse that state.  Tries the UFO path for large UDP
 * sends, otherwise fills the tail skb and allocates new ones as needed,
 * moving trailing bytes between skbs to keep fragments 8-byte aligned.
 * NOTE(review): many lines (error labels, several declarations such as
 * mtu/hh_len/copy/offset/exthdrlen, loop/branch heads) are elided in
 * this excerpt.
 */
1102 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1103 int offset, int len, int odd, struct sk_buff *skb),
1104 void *from, int length, int transhdrlen,
1105 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1106 struct rt6_info *rt, unsigned int flags)
1108 struct inet_sock *inet = inet_sk(sk);
1109 struct ipv6_pinfo *np = inet6_sk(sk);
1110 struct sk_buff *skb;
1111 unsigned int maxfraglen, fragheaderlen;
1118 int csummode = CHECKSUM_NONE;
1120 if (flags&MSG_PROBE)
/* ---- cork setup: first append on an empty write queue ---- */
1122 if (skb_queue_empty(&sk->sk_write_queue)) {
1127 if (WARN_ON(np->cork.opt))
/* Deep-copy the txoptions so the caller's copy may be freed. */
1130 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1131 if (unlikely(np->cork.opt == NULL))
1134 np->cork.opt->tot_len = opt->tot_len;
1135 np->cork.opt->opt_flen = opt->opt_flen;
1136 np->cork.opt->opt_nflen = opt->opt_nflen;
1138 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1140 if (opt->dst0opt && !np->cork.opt->dst0opt)
1143 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1145 if (opt->dst1opt && !np->cork.opt->dst1opt)
1148 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1150 if (opt->hopopt && !np->cork.opt->hopopt)
1153 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1155 if (opt->srcrt && !np->cork.opt->srcrt)
1158 /* need source address above miyazawa*/
/* Pin the route for the lifetime of the cork. */
1160 dst_hold(&rt->u.dst);
1161 inet->cork.dst = &rt->u.dst;
1162 inet->cork.fl = *fl;
1163 np->cork.hop_limit = hlimit;
1164 np->cork.tclass = tclass;
1165 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1166 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1167 if (np->frag_size < mtu) {
1169 mtu = np->frag_size;
1171 inet->cork.fragsize = mtu;
1172 if (dst_allfrag(rt->u.dst.path))
1173 inet->cork.flags |= IPCORK_ALLFRAG;
1174 inet->cork.length = 0;
1175 sk->sk_sndmsg_page = NULL;
1176 sk->sk_sndmsg_off = 0;
1177 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1178 rt->rt6i_nfheader_len;
1179 length += exthdrlen;
1180 transhdrlen += exthdrlen;
/* ---- subsequent append: reuse cork state ---- */
1182 rt = (struct rt6_info *)inet->cork.dst;
1183 fl = &inet->cork.fl;
1187 mtu = inet->cork.fragsize;
1190 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1192 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1193 (opt ? opt->opt_nflen : 0);
/* Largest fragment: payload rounded down to 8 bytes, leaving room for
 * the fragment header. */
1194 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1196 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1197 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1198 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1204 * Let's try using as much space as possible.
1205 * Use MTU if total length of the message fits into the MTU.
1206 * Otherwise, we need to reserve fragment header and
1207 * fragment alignment (= 8-15 octects, in total).
1209 * Note that we may need to "move" the data from the tail of
1210 * of the buffer to the new fragment when we split
1213 * FIXME: It may be fragmented into multiple chunks
1214 * at once if non-fragmentable extension headers
1219 inet->cork.length += length;
/* Large UDP send on a UFO-capable device: offload segmentation. */
1220 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1221 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1223 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1224 fragheaderlen, transhdrlen, mtu,
1231 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1234 while (length > 0) {
1235 /* Check if the remaining data fits into current packet. */
1236 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1238 copy = maxfraglen - skb->len;
/* ---- current skb full: allocate the next fragment skb ---- */
1242 unsigned int datalen;
1243 unsigned int fraglen;
1244 unsigned int fraggap;
1245 unsigned int alloclen;
1246 struct sk_buff *skb_prev;
1250 /* There's no room in the current skb */
/* fraggap: bytes past the 8-byte boundary in the previous skb that
 * must migrate into this new fragment. */
1252 fraggap = skb_prev->len - maxfraglen;
1257 * If remaining data exceeds the mtu,
1258 * we know we need more fragment(s).
1260 datalen = length + fraggap;
1261 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1262 datalen = maxfraglen - fragheaderlen;
1264 fraglen = datalen + fragheaderlen;
1265 if ((flags & MSG_MORE) &&
1266 !(rt->u.dst.dev->features&NETIF_F_SG))
1269 alloclen = datalen + fragheaderlen;
1272 * The last fragment gets additional space at tail.
1273 * Note: we overallocate on fragments with MSG_MODE
1274 * because we have no idea if we're the last one.
1276 if (datalen == length + fraggap)
1277 alloclen += rt->u.dst.trailer_len;
1280 * We just reserve space for fragment header.
1281 * Note: this may be overallocation if the message
1282 * (without MSG_MORE) fits into the MTU.
1284 alloclen += sizeof(struct frag_hdr);
1287 skb = sock_alloc_send_skb(sk,
1289 (flags & MSG_DONTWAIT), &err);
/* Allocation failed under corking: retry with the relaxed
 * wmem check when we are under twice the send-buffer limit. */
1292 if (atomic_read(&sk->sk_wmem_alloc) <=
1294 skb = sock_wmalloc(sk,
1295 alloclen + hh_len, 1,
1297 if (unlikely(skb == NULL))
1303 * Fill in the control structures
1305 skb->ip_summed = csummode;
1307 /* reserve for fragmentation */
1308 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1311 * Find where to start putting bytes
1313 data = skb_put(skb, fraglen);
1314 skb_set_network_header(skb, exthdrlen);
1315 data += fragheaderlen;
1316 skb->transport_header = (skb->network_header +
/* Move the 8-byte-misaligned tail of the previous skb into this
 * one, keeping the running checksum consistent. */
1319 skb->csum = skb_copy_and_csum_bits(
1320 skb_prev, maxfraglen,
1321 data + transhdrlen, fraggap, 0);
1322 skb_prev->csum = csum_sub(skb_prev->csum,
1325 pskb_trim_unique(skb_prev, maxfraglen);
1327 copy = datalen - transhdrlen - fraggap;
1332 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1339 length -= datalen - fraggap;
/* Only the first skb may use hardware checksumming. */
1342 csummode = CHECKSUM_NONE;
1345 * Put the packet on the pending queue
1347 __skb_queue_tail(&sk->sk_write_queue, skb);
/* ---- room remains in the current skb: copy into it ---- */
1354 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1358 if (getfrag(from, skb_put(skb, copy),
1359 offset, copy, off, skb) < 0) {
1360 __skb_trim(skb, off);
/* Scatter/gather device: append into page fragments instead. */
1365 int i = skb_shinfo(skb)->nr_frags;
1366 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1367 struct page *page = sk->sk_sndmsg_page;
1368 int off = sk->sk_sndmsg_off;
1371 if (page && (left = PAGE_SIZE - off) > 0) {
1374 if (page != frag->page) {
1375 if (i == MAX_SKB_FRAGS) {
1380 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1381 frag = &skb_shinfo(skb)->frags[i];
1383 } else if(i < MAX_SKB_FRAGS) {
1384 if (copy > PAGE_SIZE)
1386 page = alloc_pages(sk->sk_allocation, 0);
1391 sk->sk_sndmsg_page = page;
1392 sk->sk_sndmsg_off = 0;
1394 skb_fill_page_desc(skb, i, page, 0, 0);
1395 frag = &skb_shinfo(skb)->frags[i];
1400 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1404 sk->sk_sndmsg_off += copy;
1407 skb->data_len += copy;
1408 skb->truesize += copy;
1409 atomic_add(copy, &sk->sk_wmem_alloc);
/* ---- error path: undo the length accounting and count discard ---- */
1416 inet->cork.length -= length;
1417 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_cork_release - tear down per-cork state after a push or flush:
 * free the duplicated txoptions (and each sub-option), drop the pinned
 * route, clear the ALLFRAG flag and zero the cached flow.
 * NOTE(review): the `if (np->cork.opt)` guard line is elided in this
 * excerpt.
 */
1421 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1424 kfree(np->cork.opt->dst0opt);
1425 kfree(np->cork.opt->dst1opt);
1426 kfree(np->cork.opt->hopopt);
1427 kfree(np->cork.opt->srcrt);
1428 kfree(np->cork.opt);
1429 np->cork.opt = NULL;
1432 if (inet->cork.dst) {
1433 dst_release(inet->cork.dst);
1434 inet->cork.dst = NULL;
1435 inet->cork.flags &= ~IPCORK_ALLFRAG;
1437 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - finalize a corked send: collapse the whole
 * write queue into one skb (queue head + frag_list of the rest), push
 * extension headers and the IPv6 header from the cork state, attach the
 * route, bump ICMP/MIB counters, and hand the packet to ip6_local_out().
 * Always releases the cork state on exit.
 * NOTE(review): several lines (frag_list NULL'ing, np->cork.tclass
 * first-word OR details, out/error labels) are elided in this excerpt.
 */
1440 int ip6_push_pending_frames(struct sock *sk)
1442 struct sk_buff *skb, *tmp_skb;
1443 struct sk_buff **tail_skb;
1444 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1445 struct inet_sock *inet = inet_sk(sk);
1446 struct ipv6_pinfo *np = inet6_sk(sk);
1447 struct net *net = sock_net(sk);
1448 struct ipv6hdr *hdr;
1449 struct ipv6_txoptions *opt = np->cork.opt;
1450 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1451 struct flowi *fl = &inet->cork.fl;
1452 unsigned char proto = fl->proto;
1455 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1457 tail_skb = &(skb_shinfo(skb)->frag_list);
1459 /* move skb->data to ip header from ext header */
1460 if (skb->data < skb_network_header(skb))
1461 __skb_pull(skb, skb_network_offset(skb));
/* Chain every remaining queued skb onto the head's frag_list,
 * folding their lengths/truesize into the head. */
1462 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1463 __skb_pull(tmp_skb, skb_network_header_len(skb));
1464 *tail_skb = tmp_skb;
1465 tail_skb = &(tmp_skb->next);
1466 skb->len += tmp_skb->len;
1467 skb->data_len += tmp_skb->len;
1468 skb->truesize += tmp_skb->truesize;
1469 tmp_skb->destructor = NULL;
1473 /* Allow local fragmentation. */
1474 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1477 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1478 __skb_pull(skb, skb_network_header_len(skb));
/* Non-fragmentable options may rewrite final_dst (routing header). */
1479 if (opt && opt->opt_flen)
1480 ipv6_push_frag_opts(skb, opt, &proto);
1481 if (opt && opt->opt_nflen)
1482 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1484 skb_push(skb, sizeof(struct ipv6hdr));
1485 skb_reset_network_header(skb);
1486 hdr = ipv6_hdr(skb);
1488 *(__be32*)hdr = fl->fl6_flowlabel |
1489 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1491 hdr->hop_limit = np->cork.hop_limit;
1492 hdr->nexthdr = proto;
1493 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1494 ipv6_addr_copy(&hdr->daddr, final_dst);
1496 skb->priority = sk->sk_priority;
1497 skb->mark = sk->sk_mark;
1499 skb_dst_set(skb, dst_clone(&rt->u.dst));
1500 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1501 if (proto == IPPROTO_ICMPV6) {
1502 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1504 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1505 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1508 err = ip6_local_out(skb);
/* NET_XMIT codes are mapped to errnos; positive soft errors cleared
 * on the elided branch. */
1511 err = net_xmit_errno(err);
1517 ip6_cork_release(inet, np);
1520 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_flush_pending_frames - abort a corked send: drop every queued
 * skb (counting OUTDISCARDS per skb with a route) and release the
 * cork state.
 * NOTE(review): the kfree_skb call inside the loop is elided in this
 * excerpt.
 */
1524 void ip6_flush_pending_frames(struct sock *sk)
1526 struct sk_buff *skb;
1528 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1530 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1531 IPSTATS_MIB_OUTDISCARDS);
1535 ip6_cork_release(inet_sk(sk), inet6_sk(sk));