2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * __ip6_local_out - finish a locally generated IPv6 packet: recompute
 * payload_len from skb->len, then run the NF_INET_LOCAL_OUT netfilter
 * hook with dst_output() as the continuation on accept.
 * NOTE(review): interior lines (braces, the over-IPV6_MAXPLEN handling)
 * are elided from this view.
 */
60 int __ip6_local_out(struct sk_buff *skb)
/* Payload length excludes the fixed 40-byte IPv6 header. */
64 len = skb->len - sizeof(struct ipv6hdr);
/* payload_len is a 16-bit field; the oversize branch is elided here. */
65 if (len > IPV6_MAXPLEN)
67 ipv6_hdr(skb)->payload_len = htons(len);
69 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
70 skb_dst(skb)->dev, dst_output);
/*
 * ip6_local_out - exported wrapper around __ip6_local_out().
 * If the netfilter hook accepts the packet, it is handed to dst_output()
 * and that result is returned to the caller.
 * NOTE(review): the guard on the hook verdict before the dst_output()
 * call is elided from this view.
 */
73 int ip6_local_out(struct sk_buff *skb)
77 err = __ip6_local_out(skb);
79 err = dst_output(skb);
83 EXPORT_SYMBOL_GPL(ip6_local_out);
85 /* dev_loopback_xmit for use with netfilter. */
/*
 * Deliver a cloned multicast packet back into the local stack: rewind
 * the headers, mark it as looped back, and skip checksum verification
 * since the data never left the host.
 */
86 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 skb_reset_mac_header(newskb);
89 __skb_pull(newskb, skb_network_offset(newskb));
90 newskb->pkt_type = PACKET_LOOPBACK;
91 newskb->ip_summed = CHECKSUM_UNNECESSARY;
/* A loopback delivery without a route attached would be a bug. */
92 WARN_ON(!skb_dst(newskb));
/*
 * ip6_finish_output2 - final transmit step after POST_ROUTING.
 * Loops multicast back to the local stack when required, updates
 * multicast output statistics, then hands the skb to the neighbour
 * layer (cached hardware header or neighbour output function).
 * NOTE(review): several interior lines are elided from this view.
 */
98 static int ip6_finish_output2(struct sk_buff *skb)
100 struct dst_entry *dst = skb_dst(skb);
101 struct net_device *dev = dst->dev;
103 skb->protocol = htons(ETH_P_IPV6);
/* Multicast destination: possibly loop a copy back locally. */
106 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
107 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
/* Loop back when the socket wants multicast loop and either a
 * multicast-router socket exists (and the skb was not already
 * forwarded) or a local listener has joined the group. */
109 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
110 ((mroute6_socket(dev_net(dev)) &&
111 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
112 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
113 &ipv6_hdr(skb)->saddr))) {
114 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
116 /* Do not check for IFF_ALLMULTI; multicast routing
117 is not supported in any case.
/* The clone re-enters POST_ROUTING and is delivered locally. */
120 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
121 newskb, NULL, newskb->dev,
122 ip6_dev_loopback_xmit);
/* hop_limit 0 means the packet must not leave the host. */
124 if (ipv6_hdr(skb)->hop_limit == 0) {
125 IP6_INC_STATS(dev_net(dev), idev,
126 IPSTATS_MIB_OUTDISCARDS);
132 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
/* Fast path: cached hardware header, else neighbour output. */
137 return neigh_hh_output(dst->hh, skb);
138 else if (dst->neighbour)
139 return dst->neighbour->output(skb);
/* No usable neighbour: count as "no route" discard. */
141 IP6_INC_STATS_BH(dev_net(dst->dev),
142 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
147 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
149 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
151 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
152 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
155 static int ip6_finish_output(struct sk_buff *skb)
157 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
158 dst_allfrag(skb_dst(skb)))
159 return ip6_fragment(skb, ip6_finish_output2);
161 return ip6_finish_output2(skb);
/*
 * ip6_output - entry point installed as the route's output method.
 * Drops the packet when IPv6 is administratively disabled on the
 * egress device, otherwise runs NF_INET_POST_ROUTING (skipped for
 * rerouted skbs) before the finish handler.
 * NOTE(review): the drop path and the hook's okfn argument are elided
 * from this view.
 */
164 int ip6_output(struct sk_buff *skb)
166 struct net_device *dev = skb_dst(skb)->dev;
167 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
/* disable_ipv6 sysctl: silently discard and account the drop. */
168 if (unlikely(idev->cnf.disable_ipv6)) {
169 IP6_INC_STATS(dev_net(dev), idev,
170 IPSTATS_MIB_OUTDISCARDS);
175 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
177 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
181 * xmit an sk_buff (used by TCP)
/*
 * ip6_xmit - build the IPv6 header (and any extension headers from
 * @opt) in front of the payload and send the packet via LOCAL_OUT.
 * Oversized packets trigger an ICMPV6_PKT_TOOBIG error back to the
 * sender instead of fragmentation.
 * NOTE(review): interior lines (braces, several declarations and
 * conditions) are elided from this view.
 */
184 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
185 struct ipv6_txoptions *opt, int ipfragok)
187 struct net *net = sock_net(sk);
188 struct ipv6_pinfo *np = inet6_sk(sk);
189 struct in6_addr *first_hop = &fl->fl6_dst;
190 struct dst_entry *dst = skb_dst(skb);
192 u8 proto = fl->proto;
193 int seg_len = skb->len;
199 unsigned int head_room;
201 /* First: exthdrs may take lots of space (~8K for now)
202 MAX_HEADER is not enough.
/* Reserve room for extension headers, the IPv6 header itself and the
 * link-layer header before the payload. */
204 head_room = opt->opt_nflen + opt->opt_flen;
205 seg_len += head_room;
206 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
/* Not enough headroom: reallocate; on failure account a discard. */
208 if (skb_headroom(skb) < head_room) {
209 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
212 IPSTATS_MIB_OUTDISCARDS);
219 skb_set_owner_w(skb, sk);
/* Push fragmentable then non-fragmentable extension headers; the
 * latter may rewrite the first hop (routing header). */
222 ipv6_push_frag_opts(skb, opt, &proto);
224 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
227 skb_push(skb, sizeof(struct ipv6hdr));
228 skb_reset_network_header(skb);
231 /* Allow local fragmentation. */
236 * Fill in the IPv6 header
/* Hop limit: socket setting first, falling back to the route. */
240 hlimit = np->hop_limit;
243 hlimit = ip6_dst_hoplimit(dst);
/* First 32 bits: version 6, traffic class, flow label. */
245 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
247 hdr->payload_len = htons(seg_len);
248 hdr->nexthdr = proto;
249 hdr->hop_limit = hlimit;
251 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
252 ipv6_addr_copy(&hdr->daddr, first_hop);
254 skb->priority = sk->sk_priority;
255 skb->mark = sk->sk_mark;
/* Fits the MTU (or fragmentation allowed / GSO): transmit. */
258 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
259 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
260 IPSTATS_MIB_OUT, skb->len);
261 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
262 dst->dev, dst_output);
/* Too big: notify the local sender and fail the transmission. */
266 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
268 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
269 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
274 EXPORT_SYMBOL(ip6_xmit);
277 * To avoid extra problems ND packets are sent through this
278 * routine. It's code duplication but I really want to avoid
279 * extra checks since ipv6_build_header is used by TCP (which
280 * is for us performance critical)
/*
 * ip6_nd_hdr - build a minimal IPv6 header for a neighbour-discovery
 * packet (no traffic class, no flow label, hop limit from the socket).
 * NOTE(review): the trailing parameters of the signature (presumably
 * the protocol and payload length) are elided from this view.
 */
283 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
284 const struct in6_addr *saddr, const struct in6_addr *daddr,
287 struct ipv6_pinfo *np = inet6_sk(sk);
291 skb->protocol = htons(ETH_P_IPV6);
294 totlen = len + sizeof(struct ipv6hdr);
296 skb_reset_network_header(skb);
297 skb_put(skb, sizeof(struct ipv6hdr));
/* Version 6, zero traffic class and flow label. */
300 *(__be32*)hdr = htonl(0x60000000);
302 hdr->payload_len = htons(len);
303 hdr->nexthdr = proto;
304 hdr->hop_limit = np->hop_limit;
306 ipv6_addr_copy(&hdr->saddr, saddr);
307 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a Router Alert packet to every raw socket
 * registered (via IPV6_ROUTER_ALERT) for the given @sel value, cloning
 * the skb for all but the last matching socket.
 * Returns whether the packet was consumed by a listener.
 * NOTE(review): clone-failure handling and return statements are
 * elided from this view.
 */
312 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
314 struct ip6_ra_chain *ra;
315 struct sock *last = NULL;
/* Walk the global RA chain under the read lock. */
317 read_lock(&ip6_ra_lock);
318 for (ra = ip6_ra_chain; ra; ra = ra->next) {
319 struct sock *sk = ra->sk;
/* Match on selector and (if bound) on the ingress interface. */
320 if (sk && ra->sel == sel &&
321 (!sk->sk_bound_dev_if ||
322 sk->sk_bound_dev_if == skb->dev->ifindex)) {
/* Previous match gets a clone; the last one gets the original. */
324 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
326 rawv6_rcv(last, skb2);
333 rawv6_rcv(last, skb);
334 read_unlock(&ip6_ra_lock);
337 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - decide how a packet destined to a proxied
 * address must be handled: passed to local input (ND messages), dropped
 * (link-local destinations), or forwarded normally.
 * NOTE(review): return statements and parts of the switch body are
 * elided from this view.
 */
341 static int ip6_forward_proxy_check(struct sk_buff *skb)
343 struct ipv6hdr *hdr = ipv6_hdr(skb);
344 u8 nexthdr = hdr->nexthdr;
/* Skip any extension headers to find the transport protocol. */
347 if (ipv6_ext_hdr(nexthdr)) {
348 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
352 offset = sizeof(struct ipv6hdr);
354 if (nexthdr == IPPROTO_ICMPV6) {
355 struct icmp6hdr *icmp6;
/* Make sure at least the ICMPv6 type octet is in linear data. */
357 if (!pskb_may_pull(skb, (skb_network_header(skb) +
358 offset + 1 - skb->data)))
361 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
363 switch (icmp6->icmp6_type) {
364 case NDISC_ROUTER_SOLICITATION:
365 case NDISC_ROUTER_ADVERTISEMENT:
366 case NDISC_NEIGHBOUR_SOLICITATION:
367 case NDISC_NEIGHBOUR_ADVERTISEMENT:
369 /* For reaction involving unicast neighbor discovery
370 * message destined to the proxied address, pass it to
380 * The proxying router can't forward traffic sent to a link-local
381 * address, so signal the sender and discard the packet. This
382 * behavior is clarified by the MIPv6 specification.
384 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
385 dst_link_failure(skb);
/* Final step of IPv6 forwarding: hand the skb to the route's output path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
/*
 * ip6_forward - forward a received IPv6 packet.
 * Performs the full RFC-mandated forwarding checks: forwarding enabled,
 * XFRM policy, Router Alert delivery, hop-limit expiry, ND proxying,
 * redirect generation, source-address sanity, MTU, and finally hands
 * the packet to the NF_INET_FORWARD hook.
 * NOTE(review): many interior lines (drop labels, returns, mtu
 * computation) are elided from this view.
 */
397 int ip6_forward(struct sk_buff *skb)
399 struct dst_entry *dst = skb_dst(skb);
400 struct ipv6hdr *hdr = ipv6_hdr(skb);
401 struct inet6_skb_parm *opt = IP6CB(skb);
402 struct net *net = dev_net(dst->dev);
/* Forwarding globally disabled: do not forward. */
405 if (net->ipv6.devconf_all->forwarding == 0)
/* LRO-merged skbs must never be forwarded (warn and drop). */
408 if (skb_warn_if_lro(skb))
411 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
412 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
416 skb_forward_csum(skb);
419 * We DO NOT make any processing on
420 * RA packets, pushing them to user level AS IS
421 * without any WARRANTY that application will be able
422 * to interpret them. The reason is that we
423 * cannot make anything clever here.
425 * We are not end-node, so that if packet contains
426 * AH/ESP, we cannot make anything.
427 * Defragmentation also would be mistake, RA packets
428 * cannot be fragmented, because there is no warranty
429 * that different fragments will go along one path. --ANK
/* Router Alert option present: offer the packet to RA listeners. */
432 u8 *ptr = skb_network_header(skb) + opt->ra;
433 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
438 * check and decrement ttl
440 if (hdr->hop_limit <= 1) {
441 /* Force OUTPUT device used as source address */
443 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
444 IP6_INC_STATS_BH(net,
445 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
451 /* XXX: idev->cnf.proxy_ndp? */
/* ND-proxy: a proxied ND message is handled locally instead. */
452 if (net->ipv6.devconf_all->proxy_ndp &&
453 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
454 int proxied = ip6_forward_proxy_check(skb);
456 return ip6_input(skb);
457 else if (proxied < 0) {
458 IP6_INC_STATS(net, ip6_dst_idev(dst),
459 IPSTATS_MIB_INDISCARDS);
464 if (!xfrm6_route_forward(skb)) {
465 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
470 /* IPv6 specs say nothing about it, but it is clear that we cannot
471 send redirects to source routed frames.
472 We don't send redirects to frames decapsulated from IPsec.
/* Same ingress/egress device, no source routing, no IPsec: a
 * redirect may be warranted. */
474 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
475 !skb_sec_path(skb)) {
476 struct in6_addr *target = NULL;
478 struct neighbour *n = dst->neighbour;
481 * incoming and outgoing devices are the same
/* Redirect target: the gateway for indirect routes, else the
 * final destination itself. */
485 rt = (struct rt6_info *) dst;
486 if ((rt->rt6i_flags & RTF_GATEWAY))
487 target = (struct in6_addr*)&n->primary_key;
489 target = &hdr->daddr;
491 /* Limit redirects both by destination (here)
492 and by source (inside ndisc_send_redirect)
494 if (xrlim_allow(dst, 1*HZ))
495 ndisc_send_redirect(skb, n, target);
497 int addrtype = ipv6_addr_type(&hdr->saddr);
499 /* This check is security critical. */
500 if (addrtype == IPV6_ADDR_ANY ||
501 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
503 if (addrtype & IPV6_ADDR_LINKLOCAL) {
504 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
505 ICMPV6_NOT_NEIGHBOUR, 0);
/* Enforce the IPv6 minimum MTU floor before the size check. */
511 if (mtu < IPV6_MIN_MTU)
514 if (skb->len > mtu) {
515 /* Again, force OUTPUT device used as source address */
517 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
518 IP6_INC_STATS_BH(net,
519 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
520 IP6_INC_STATS_BH(net,
521 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
/* Ensure writable headroom before mangling the hop limit. */
526 if (skb_cow(skb, dst->dev->hard_header_len)) {
527 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
533 /* Mangling hops number delayed to point after skb COW */
537 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
538 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
542 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata (type, priority, dst,
 * mark, traffic-control index, netfilter trace flag, secmark) from an
 * original skb onto a freshly built fragment.
 * NOTE(review): some preprocessor closers (#endif) are elided from
 * this view.
 */
548 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
550 to->pkt_type = from->pkt_type;
551 to->priority = from->priority;
552 to->protocol = from->protocol;
/* The fragment holds its own reference on the route. */
554 skb_dst_set(to, dst_clone(skb_dst(from)));
556 to->mark = from->mark;
558 #ifdef CONFIG_NET_SCHED
559 to->tc_index = from->tc_index;
562 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
563 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
564 to->nf_trace = from->nf_trace;
566 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the extension-header chain to find where
 * the Fragment header must be inserted, returning that offset and
 * setting *nexthdr to the nexthdr field that will precede it.
 * NOTE(review): most of the switch over header types is elided from
 * this view.
 */
569 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
571 u16 offset = sizeof(struct ipv6hdr);
572 struct ipv6_opt_hdr *exthdr =
573 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
574 unsigned int packet_len = skb->tail - skb->network_header;
576 *nexthdr = &ipv6_hdr(skb)->nexthdr;
/* Iterate while at least the 2-byte option header fits. */
578 while (offset + 1 <= packet_len) {
584 case NEXTHDR_ROUTING:
588 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
/* A Home Address option forces fragmentation after this header. */
589 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
599 offset += ipv6_optlen(exthdr);
600 *nexthdr = &exthdr->nexthdr;
601 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * ip6_fragment - split an oversized IPv6 packet into MTU-sized
 * fragments, each carrying a Fragment extension header, and pass every
 * fragment to @output.  Two strategies are used: a fast path that
 * re-uses an existing frag_list, and a slow path that allocates and
 * copies new fragment skbs.
 * NOTE(review): a large number of interior lines (labels, error paths,
 * several statements) are elided from this view.
 */
608 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
610 struct sk_buff *frag;
611 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
612 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
613 struct ipv6hdr *tmp_hdr;
615 unsigned int mtu, hlen, left, len;
617 int ptr, offset = 0, err=0;
618 u8 *prevhdr, nexthdr = 0;
619 struct net *net = dev_net(skb_dst(skb)->dev);
/* hlen = length of the unfragmentable part (IPv6 hdr + exthdrs). */
621 hlen = ip6_find_1stfragopt(skb, &prevhdr);
624 mtu = ip6_skb_dst_mtu(skb);
626 /* We must not fragment if the socket is set to force MTU discovery
627 * or if the skb it not generated by a local socket.
629 if (!skb->local_df) {
630 skb->dev = skb_dst(skb)->dev;
631 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
632 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
633 IPSTATS_MIB_FRAGFAILS);
/* Honour a smaller socket-requested fragment size. */
638 if (np && np->frag_size < mtu) {
/* Per-fragment payload budget after headers. */
642 mtu -= hlen + sizeof(struct frag_hdr);
/* ---- Fast path: the skb already carries a fragment list. ---- */
644 if (skb_has_frags(skb)) {
645 int first_len = skb_pagelen(skb);
/* The existing geometry must already be fragment-shaped. */
648 if (first_len - hlen > mtu ||
649 ((first_len - hlen) & 7) ||
653 skb_walk_frags(skb, frag) {
654 /* Correct geometry. */
655 if (frag->len > mtu ||
656 ((frag->len & 7) && frag->next) ||
657 skb_headroom(frag) < hlen)
660 /* Partially cloned skb? */
661 if (skb_shared(frag))
/* Transfer wmem accounting from the fragments to the head. */
667 frag->destructor = sock_wfree;
668 truesizes += frag->truesize;
/* Detach the frag list; the fragments become standalone skbs. */
674 frag = skb_shinfo(skb)->frag_list;
675 skb_frag_list_init(skb);
/* Splice a Fragment header into the unfragmentable part. */
678 *prevhdr = NEXTHDR_FRAGMENT;
679 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
681 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
682 IPSTATS_MIB_FRAGFAILS);
686 __skb_pull(skb, hlen);
687 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
688 __skb_push(skb, hlen);
689 skb_reset_network_header(skb);
690 memcpy(skb_network_header(skb), tmp_hdr, hlen);
/* All fragments share one identification value. */
692 ipv6_select_ident(fh);
693 fh->nexthdr = nexthdr;
695 fh->frag_off = htons(IP6_MF);
696 frag_id = fh->identification;
698 first_len = skb_pagelen(skb);
699 skb->data_len = first_len - skb_headlen(skb);
700 skb->truesize -= truesizes;
701 skb->len = first_len;
702 ipv6_hdr(skb)->payload_len = htons(first_len -
703 sizeof(struct ipv6hdr));
705 dst_hold(&rt->u.dst);
708 /* Prepare header of the next frame,
709 * before previous one went down. */
711 frag->ip_summed = CHECKSUM_NONE;
712 skb_reset_transport_header(frag);
713 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
714 __skb_push(frag, hlen);
715 skb_reset_network_header(frag);
716 memcpy(skb_network_header(frag), tmp_hdr,
/* Fragment offset advances by the previous fragment's payload. */
718 offset += skb->len - hlen - sizeof(struct frag_hdr);
719 fh->nexthdr = nexthdr;
721 fh->frag_off = htons(offset);
722 if (frag->next != NULL)
723 fh->frag_off |= htons(IP6_MF);
724 fh->identification = frag_id;
725 ipv6_hdr(frag)->payload_len =
727 sizeof(struct ipv6hdr));
728 ip6_copy_metadata(frag, skb);
733 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
734 IPSTATS_MIB_FRAGCREATES);
747 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
748 IPSTATS_MIB_FRAGOKS);
749 dst_release(&rt->u.dst);
759 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
760 IPSTATS_MIB_FRAGFAILS);
761 dst_release(&rt->u.dst);
/* ---- Slow path: allocate and copy each fragment. ---- */
766 left = skb->len - hlen; /* Space per frame */
767 ptr = hlen; /* Where to start from */
770 * Fragment the datagram.
773 *prevhdr = NEXTHDR_FRAGMENT;
776 * Keep copying data until we run out.
780 /* IF: it doesn't fit, use 'mtu' - the data space left */
783 /* IF: we are not sending upto and including the packet end
784 then align the next start on an eight byte boundary */
792 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
793 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
794 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
795 IPSTATS_MIB_FRAGFAILS);
801 * Set up data on packet
804 ip6_copy_metadata(frag, skb);
805 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
806 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
807 skb_reset_network_header(frag);
808 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
809 frag->transport_header = (frag->network_header + hlen +
810 sizeof(struct frag_hdr));
813 * Charge the memory for the fragment to any owner
817 skb_set_owner_w(frag, skb->sk);
820 * Copy the packet header into the new buffer.
822 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
825 * Build fragment header.
827 fh->nexthdr = nexthdr;
/* First fragment picks the identification; the rest reuse it. */
830 ipv6_select_ident(fh);
831 frag_id = fh->identification;
833 fh->identification = frag_id;
836 * Copy a block of the IP datagram.
838 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
842 fh->frag_off = htons(offset);
844 fh->frag_off |= htons(IP6_MF);
845 ipv6_hdr(frag)->payload_len = htons(frag->len -
846 sizeof(struct ipv6hdr));
852 * Put this fragment into the sending queue.
858 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
859 IPSTATS_MIB_FRAGCREATES);
861 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
862 IPSTATS_MIB_FRAGOKS);
867 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
868 IPSTATS_MIB_FRAGFAILS);
873 static inline int ip6_rt_check(struct rt6key *rt_key,
874 struct in6_addr *fl_addr,
875 struct in6_addr *addr_cache)
877 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
878 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * ip6_sk_dst_check - validate a socket's cached route against a flow.
 * Returns the dst if it is still usable for this flow, otherwise the
 * stale reference is dropped and NULL-equivalent handling applies.
 * NOTE(review): the early-NULL check, the release path and the return
 * statements are elided from this view.
 */
881 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
882 struct dst_entry *dst,
885 struct ipv6_pinfo *np = inet6_sk(sk);
886 struct rt6_info *rt = (struct rt6_info *)dst;
891 /* Yes, checking route validity in not connected
892 * case is not very simple. Take into account,
893 * that we do not support routing by source, TOS,
894 * and MSG_DONTROUTE --ANK (980726)
896 * 1. ip6_rt_check(): If route was host route,
897 * check that cached destination is current.
898 * If it is network route, we still may
899 * check its validity using saved pointer
900 * to the last used address: daddr_cache.
901 * We do not want to save whole address now,
902 * (because main consumer of this service
903 * is tcp, which has not this problem),
904 * so that the last trick works only on connected
906 * 2. oif also should be the same.
/* Stale when dst (and optionally src under subtree routing) no
 * longer matches, or the output interface changed. */
908 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
909 #ifdef CONFIG_IPV6_SUBTREES
910 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
912 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - core route lookup: resolve *dst for @fl,
 * select a source address if the flow has none, and (with optimistic
 * DAD) fall back to the default-router route when the next hop's
 * neighbour entry is not yet valid.
 * NOTE(review): interior lines (declarations, returns, release path)
 * are elided from this view.
 */
921 static int ip6_dst_lookup_tail(struct sock *sk,
922 struct dst_entry **dst, struct flowi *fl)
925 struct net *net = sock_net(sk);
928 *dst = ip6_route_output(net, sk, fl);
930 if ((err = (*dst)->error))
931 goto out_err_release;
/* No source address in the flow: pick one for this route. */
933 if (ipv6_addr_any(&fl->fl6_src)) {
934 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
936 sk ? inet6_sk(sk)->srcprefs : 0,
939 goto out_err_release;
942 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
944 * Here if the dst entry we've looked up
945 * has a neighbour entry that is in the INCOMPLETE
946 * state and the src address from the flow is
947 * marked as OPTIMISTIC, we release the found
948 * dst entry and replace it instead with the
949 * dst entry of the nexthop router
951 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
952 struct inet6_ifaddr *ifp;
956 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
959 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
965 * We need to get the dst entry for the
966 * default router instead
/* Re-run the lookup with a wildcard destination to hit the
 * default route. */
969 memcpy(&fl_gw, fl, sizeof(struct flowi));
970 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
971 *dst = ip6_route_output(net, sk, &fl_gw);
972 if ((err = (*dst)->error))
973 goto out_err_release;
981 if (err == -ENETUNREACH)
982 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
989 * ip6_dst_lookup - perform route lookup on flow
990 * @sk: socket which provides route info
991 * @dst: pointer to dst_entry * for result
992 * @fl: flow to lookup
994 * This function performs a route lookup on the given flow.
996 * It returns zero on success, or a standard errno code on error.
/* Thin exported wrapper; the work happens in ip6_dst_lookup_tail().
 * NOTE(review): interior lines (braces, possibly an initialisation of
 * *dst) are elided from this view. */
998 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1001 return ip6_dst_lookup_tail(sk, dst, fl);
1003 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1006 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1007 * @sk: socket which provides the dst cache and route info
1008 * @dst: pointer to dst_entry * for result
1009 * @fl: flow to lookup
1011 * This function performs a route lookup on the given flow with the
1012 * possibility of using the cached route in the socket if it is valid.
1013 * It will take the socket dst lock when operating on the dst cache.
1014 * As a result, this function can only be used in process context.
1016 * It returns zero on success, or a standard errno code on error.
/* Try the socket's cached dst first; fall back to a fresh lookup.
 * NOTE(review): interior lines (braces, the early-return when the
 * cached dst is valid) are elided from this view. */
1018 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1022 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1023 *dst = ip6_sk_dst_check(sk, *dst, fl);
1026 return ip6_dst_lookup_tail(sk, dst, fl);
1028 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * ip6_ufo_append_data - UDP fragmentation offload path for
 * ip6_append_data(): build (or extend) one large GSO skb instead of
 * software-fragmenting, letting the device split it on transmit.
 * NOTE(review): error returns and some interior lines are elided from
 * this view.
 */
1030 static inline int ip6_ufo_append_data(struct sock *sk,
1031 int getfrag(void *from, char *to, int offset, int len,
1032 int odd, struct sk_buff *skb),
1033 void *from, int length, int hh_len, int fragheaderlen,
1034 int transhdrlen, int mtu,unsigned int flags)
1037 struct sk_buff *skb;
1040 /* There is support for UDP large send offload by network
1041 * device, so create one single skb packet containing complete
/* No packet queued yet: allocate the head skb and lay out its
 * link-layer, network and transport header regions. */
1044 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1045 skb = sock_alloc_send_skb(sk,
1046 hh_len + fragheaderlen + transhdrlen + 20,
1047 (flags & MSG_DONTWAIT), &err);
1051 /* reserve space for Hardware header */
1052 skb_reserve(skb, hh_len);
1054 /* create space for UDP/IP header */
1055 skb_put(skb,fragheaderlen + transhdrlen);
1057 /* initialize network header pointer */
1058 skb_reset_network_header(skb);
1060 /* initialize protocol header pointer */
1061 skb->transport_header = skb->network_header + fragheaderlen;
/* Device will finish the checksum on a GSO skb. */
1063 skb->ip_summed = CHECKSUM_PARTIAL;
1065 sk->sk_sndmsg_off = 0;
/* Append the user payload as page fragments. */
1068 err = skb_append_datato_frags(sk,skb, getfrag, from,
1069 (length - transhdrlen));
1071 struct frag_hdr fhdr;
1073 /* Specify the length of each IPv6 datagram fragment.
1074 * It has to be a multiple of 8.
1076 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1077 sizeof(struct frag_hdr)) & ~7;
1078 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
/* Pre-select the fragment identification the device will use. */
1079 ipv6_select_ident(&fhdr);
1080 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1081 __skb_queue_tail(&sk->sk_write_queue, skb);
1085 /* There is not enough support do UPD LSO,
1086 * so follow normal path
1093 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1096 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1099 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1102 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_append_data - append user data to the socket's corked output
 * queue, building MTU-sized packets as needed.  On the first call the
 * cork state (duplicated options, route, hop limit, tclass, fragment
 * size) is set up; later calls reuse it.  Data is placed in the linear
 * area or in page fragments depending on NETIF_F_SG, and a UFO fast
 * path is taken for large UDP sends on capable devices.
 * NOTE(review): a substantial number of interior lines (labels, error
 * unwinding, several declarations and conditions) are elided from this
 * view.
 */
1105 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1106 int offset, int len, int odd, struct sk_buff *skb),
1107 void *from, int length, int transhdrlen,
1108 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1109 struct rt6_info *rt, unsigned int flags)
1111 struct inet_sock *inet = inet_sk(sk);
1112 struct ipv6_pinfo *np = inet6_sk(sk);
1113 struct sk_buff *skb;
1114 unsigned int maxfraglen, fragheaderlen;
1121 int csummode = CHECKSUM_NONE;
1123 if (flags&MSG_PROBE)
/* ---- First call on an empty queue: set up the cork state. ---- */
1125 if (skb_queue_empty(&sk->sk_write_queue)) {
1130 if (WARN_ON(np->cork.opt))
/* Deep-copy the tx options so the caller's copy may go away. */
1133 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1134 if (unlikely(np->cork.opt == NULL))
1137 np->cork.opt->tot_len = opt->tot_len;
1138 np->cork.opt->opt_flen = opt->opt_flen;
1139 np->cork.opt->opt_nflen = opt->opt_nflen;
1141 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1143 if (opt->dst0opt && !np->cork.opt->dst0opt)
1146 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1148 if (opt->dst1opt && !np->cork.opt->dst1opt)
1151 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1153 if (opt->hopopt && !np->cork.opt->hopopt)
1156 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1158 if (opt->srcrt && !np->cork.opt->srcrt)
1161 /* need source address above miyazawa*/
/* Pin the route and remember the flow for later pushes. */
1163 dst_hold(&rt->u.dst);
1164 inet->cork.dst = &rt->u.dst;
1165 inet->cork.fl = *fl;
1166 np->cork.hop_limit = hlimit;
1167 np->cork.tclass = tclass;
1168 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1169 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1170 if (np->frag_size < mtu) {
1172 mtu = np->frag_size;
1174 inet->cork.fragsize = mtu;
1175 if (dst_allfrag(rt->u.dst.path))
1176 inet->cork.flags |= IPCORK_ALLFRAG;
1177 inet->cork.length = 0;
1178 sk->sk_sndmsg_page = NULL;
1179 sk->sk_sndmsg_off = 0;
1180 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1181 rt->rt6i_nfheader_len;
1182 length += exthdrlen;
1183 transhdrlen += exthdrlen;
/* ---- Subsequent call: restore cork state. ---- */
1185 rt = (struct rt6_info *)inet->cork.dst;
1186 fl = &inet->cork.fl;
1190 mtu = inet->cork.fragsize;
1193 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
/* Per-fragment header overhead and largest 8-aligned payload. */
1195 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1196 (opt ? opt->opt_nflen : 0);
1197 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
/* Reject totals that cannot fit the 16-bit payload_len field. */
1199 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1200 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1201 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1207 * Let's try using as much space as possible.
1208 * Use MTU if total length of the message fits into the MTU.
1209 * Otherwise, we need to reserve fragment header and
1210 * fragment alignment (= 8-15 octects, in total).
1212 * Note that we may need to "move" the data from the tail of
1213 * of the buffer to the new fragment when we split
1216 * FIXME: It may be fragmented into multiple chunks
1217 * at once if non-fragmentable extension headers
1222 inet->cork.length += length;
/* UFO fast path for large UDP sends on capable hardware. */
1223 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1224 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1226 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1227 fragheaderlen, transhdrlen, mtu,
1234 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
/* ---- Main copy loop: fill the tail skb, allocating new ones. ---- */
1237 while (length > 0) {
1238 /* Check if the remaining data fits into current packet. */
1239 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1241 copy = maxfraglen - skb->len;
1245 unsigned int datalen;
1246 unsigned int fraglen;
1247 unsigned int fraggap;
1248 unsigned int alloclen;
1249 struct sk_buff *skb_prev;
1253 /* There's no room in the current skb */
/* Bytes that must migrate from the previous skb's tail to keep
 * every fragment 8-aligned. */
1255 fraggap = skb_prev->len - maxfraglen;
1260 * If remaining data exceeds the mtu,
1261 * we know we need more fragment(s).
1263 datalen = length + fraggap;
1264 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1265 datalen = maxfraglen - fragheaderlen;
1267 fraglen = datalen + fragheaderlen;
1268 if ((flags & MSG_MORE) &&
1269 !(rt->u.dst.dev->features&NETIF_F_SG))
1272 alloclen = datalen + fragheaderlen;
1275 * The last fragment gets additional space at tail.
1276 * Note: we overallocate on fragments with MSG_MODE
1277 * because we have no idea if we're the last one.
1279 if (datalen == length + fraggap)
1280 alloclen += rt->u.dst.trailer_len;
1283 * We just reserve space for fragment header.
1284 * Note: this may be overallocation if the message
1285 * (without MSG_MORE) fits into the MTU.
1287 alloclen += sizeof(struct frag_hdr);
1290 skb = sock_alloc_send_skb(sk,
1292 (flags & MSG_DONTWAIT), &err);
/* Atomic path (e.g. corked, lock held): non-blocking alloc
 * bounded by the send-buffer limit. */
1295 if (atomic_read(&sk->sk_wmem_alloc) <=
1297 skb = sock_wmalloc(sk,
1298 alloclen + hh_len, 1,
1300 if (unlikely(skb == NULL))
1306 * Fill in the control structures
1308 skb->ip_summed = csummode;
1310 /* reserve for fragmentation */
1311 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1314 * Find where to start putting bytes
1316 data = skb_put(skb, fraglen);
1317 skb_set_network_header(skb, exthdrlen);
1318 data += fragheaderlen;
1319 skb->transport_header = (skb->network_header +
/* Move the 8-alignment overflow from the previous skb here,
 * adjusting its checksum accordingly. */
1322 skb->csum = skb_copy_and_csum_bits(
1323 skb_prev, maxfraglen,
1324 data + transhdrlen, fraggap, 0);
1325 skb_prev->csum = csum_sub(skb_prev->csum,
1328 pskb_trim_unique(skb_prev, maxfraglen);
1330 copy = datalen - transhdrlen - fraggap;
1335 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1342 length -= datalen - fraggap;
/* Only the first packet may carry the transport header /
 * hardware checksum state. */
1345 csummode = CHECKSUM_NONE;
1348 * Put the packet on the pending queue
1350 __skb_queue_tail(&sk->sk_write_queue, skb);
/* No scatter-gather: copy into the linear area. */
1357 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1361 if (getfrag(from, skb_put(skb, copy),
1362 offset, copy, off, skb) < 0) {
1363 __skb_trim(skb, off);
/* Scatter-gather: copy into shared pages tracked on the sock. */
1368 int i = skb_shinfo(skb)->nr_frags;
1369 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1370 struct page *page = sk->sk_sndmsg_page;
1371 int off = sk->sk_sndmsg_off;
1374 if (page && (left = PAGE_SIZE - off) > 0) {
/* Current cached page not yet referenced by this skb:
 * start a new (empty) frag slot pointing at it. */
1377 if (page != frag->page) {
1378 if (i == MAX_SKB_FRAGS) {
1383 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1384 frag = &skb_shinfo(skb)->frags[i];
1386 } else if(i < MAX_SKB_FRAGS) {
1387 if (copy > PAGE_SIZE)
/* Allocate a fresh page and cache it on the socket. */
1389 page = alloc_pages(sk->sk_allocation, 0);
1394 sk->sk_sndmsg_page = page;
1395 sk->sk_sndmsg_off = 0;
1397 skb_fill_page_desc(skb, i, page, 0, 0);
1398 frag = &skb_shinfo(skb)->frags[i];
1403 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1407 sk->sk_sndmsg_off += copy;
1410 skb->data_len += copy;
1411 skb->truesize += copy;
1412 atomic_add(copy, &sk->sk_wmem_alloc);
/* Error unwind: roll back the corked length and count a discard. */
1419 inet->cork.length -= length;
1420 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_cork_release - tear down per-socket cork state: free the
 * duplicated tx options, drop the held route reference, clear the
 * allfrag flag and wipe the saved flow.
 * NOTE(review): the guard around the option frees is elided from this
 * view.
 */
1424 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1427 kfree(np->cork.opt->dst0opt);
1428 kfree(np->cork.opt->dst1opt);
1429 kfree(np->cork.opt->hopopt);
1430 kfree(np->cork.opt->srcrt);
1431 kfree(np->cork.opt);
1432 np->cork.opt = NULL;
/* Release the route pinned by ip6_append_data(). */
1435 if (inet->cork.dst) {
1436 dst_release(inet->cork.dst);
1437 inet->cork.dst = NULL;
1438 inet->cork.flags &= ~IPCORK_ALLFRAG;
1440 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - coalesce all skbs queued by
 * ip6_append_data() into one packet (head skb + frag_list), prepend
 * the extension headers and IPv6 header from the cork state, and send
 * it via ip6_local_out().  Always releases the cork state.
 * NOTE(review): several interior lines (labels, error handling) are
 * elided from this view.
 */
1443 int ip6_push_pending_frames(struct sock *sk)
1445 struct sk_buff *skb, *tmp_skb;
1446 struct sk_buff **tail_skb;
1447 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1448 struct inet_sock *inet = inet_sk(sk);
1449 struct ipv6_pinfo *np = inet6_sk(sk);
1450 struct net *net = sock_net(sk);
1451 struct ipv6hdr *hdr;
1452 struct ipv6_txoptions *opt = np->cork.opt;
1453 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1454 struct flowi *fl = &inet->cork.fl;
1455 unsigned char proto = fl->proto;
1458 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1460 tail_skb = &(skb_shinfo(skb)->frag_list);
1462 /* move skb->data to ip header from ext header */
1463 if (skb->data < skb_network_header(skb))
1464 __skb_pull(skb, skb_network_offset(skb));
/* Chain the remaining queued skbs onto the head's frag_list,
 * folding their sizes into the head and dropping destructors. */
1465 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1466 __skb_pull(tmp_skb, skb_network_header_len(skb));
1467 *tail_skb = tmp_skb;
1468 tail_skb = &(tmp_skb->next);
1469 skb->len += tmp_skb->len;
1470 skb->data_len += tmp_skb->len;
1471 skb->truesize += tmp_skb->truesize;
1472 tmp_skb->destructor = NULL;
1476 /* Allow local fragmentation. */
1477 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1480 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1481 __skb_pull(skb, skb_network_header_len(skb));
/* Push fragmentable, then non-fragmentable extension headers; the
 * latter may rewrite the final destination (routing header). */
1482 if (opt && opt->opt_flen)
1483 ipv6_push_frag_opts(skb, opt, &proto);
1484 if (opt && opt->opt_nflen)
1485 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1487 skb_push(skb, sizeof(struct ipv6hdr));
1488 skb_reset_network_header(skb);
1489 hdr = ipv6_hdr(skb);
/* First 32 bits: version 6, corked traffic class, flow label. */
1491 *(__be32*)hdr = fl->fl6_flowlabel |
1492 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1494 hdr->hop_limit = np->cork.hop_limit;
1495 hdr->nexthdr = proto;
1496 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1497 ipv6_addr_copy(&hdr->daddr, final_dst);
1499 skb->priority = sk->sk_priority;
1500 skb->mark = sk->sk_mark;
1502 skb_dst_set(skb, dst_clone(&rt->u.dst));
1503 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
/* ICMPv6 keeps its own per-type output counters. */
1504 if (proto == IPPROTO_ICMPV6) {
1505 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1507 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1508 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1511 err = ip6_local_out(skb);
1514 err = net_xmit_errno(err);
1520 ip6_cork_release(inet, np);
1523 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_flush_pending_frames - drop everything queued by
 * ip6_append_data() without sending, counting each skb as an output
 * discard, then release the cork state.
 * NOTE(review): the kfree_skb() inside the loop is elided from this
 * view.
 */
1527 void ip6_flush_pending_frames(struct sock *sk)
1529 struct sk_buff *skb;
1531 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1533 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1534 IPSTATS_MIB_OUTDISCARDS);
1538 ip6_cork_release(inet_sk(sk), inet6_sk(sk));