/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov	:	allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);
#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache is
	   held not per host but per port pair, and the TW bucket is used as
	   the state holder.

	   If the TW bucket has already been destroyed, we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
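
/*
 * Illustrative sketch (not part of the kernel build): the reuse test
 * above, reduced to plain C. The standalone framing and names are
 * assumptions for illustration only; the real decision also depends on
 * sysctl_tcp_tw_reuse and on whether the caller passed twp.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

/* May we reuse a TIME-WAIT port pair at time 'now' (in seconds)? */
static int tw_reusable(uint32_t ts_recent_stamp, uint32_t now)
{
	/* A recorded timestamp at least one second old is enough:
	 * PAWS then protects the new connection from old segments. */
	return ts_recent_stamp && (now - ts_recent_stamp > 1);
}

int main(void)
{
	printf("%d\n", tw_reusable(1000, 1002));	/* 1: reusable */
	printf("%d\n", tw_reusable(1000, 1000));	/* 0: too fresh */
	return 0;
}
#endif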

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->inet_sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->inet_saddr)
		inet->inet_saddr = rt->rt_src;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it
		 * when trying a new connection.
		 */
		if (peer) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				tp->rx_opt.ts_recent = peer->tcp_ts;
			}
		}
	}

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->inet_sport, inet->inet_dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
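
/*
 * Userspace view (illustrative, not part of the kernel build): a
 * blocking connect() on a TCP socket is what ends up in tcp_v4_connect()
 * above. Standard sockets API only; the address is a placeholder.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in sin;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;	/* anything else yields EAFNOSUPPORT */
	sin.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);

	if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		perror("connect");	/* e.g. ENETUNREACH from the routing step */
	close(fd);
	return 0;
}
#endif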

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case that this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
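
/*
 * Illustrative sketch (not part of the kernel build): what shrinking the
 * path MTU means for the MSS. tcp_sync_mss() is considerably more
 * involved (options, offload, caching); this is only the base arithmetic
 * for IPv4 and TCP headers without options.
 */
#if 0
#include <stdio.h>

/* MSS = link MTU minus IPv4 and TCP base headers (20 bytes each). */
static int mss_from_mtu(int mtu)
{
	return mtu - 20 - 20;
}

int main(void)
{
	printf("%d\n", mss_from_mtu(1500));	/* 1460 on Ethernet */
	printf("%d\n", mss_from_mtu(1400));	/* 1360 after a PMTU drop */
	return 0;
}
#endif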

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
					 icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen normally, but it can,
			       e.g. if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
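
/*
 * Illustrative sketch (not part of the kernel build): the positive "err"
 * convention mentioned in the comment above tcp_v4_err() -
 * icmp type << 8 | code - packed and unpacked in plain C.
 */
#if 0
#include <stdio.h>

static int icmp_err_pack(int type, int code)
{
	return (type << 8) | code;
}

int main(void)
{
	int err = icmp_err_pack(3, 4);	/* dest-unreach, frag-needed */

	printf("type=%d code=%d\n", err >> 8, err & 0xff);
	return 0;
}
#endif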

static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
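
/*
 * Illustrative sketch (not part of the kernel build): the Internet
 * checksum (RFC 1071) that tcp_v4_check() ultimately computes - a 16-bit
 * one's-complement sum with end-around carry. The kernel versions are
 * incremental and architecture-optimized; this is only the reference
 * algorithm over a flat buffer.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t inet_csum(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(data[i] << 8 | data[i + 1]);
	if (len & 1)			/* pad an odd trailing byte */
		sum += (uint32_t)(data[len - 1] << 8);
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	const uint8_t buf[] = { 0x45, 0x00, 0x00, 0x3c };

	printf("0x%04x\n", inet_csum(buf, sizeof(buf)));
	return 0;
}
#endif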

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: a packet that caused an RST is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	net = dev_net(skb_dst(skb)->dev);
	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}
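
/*
 * Illustrative sketch (not part of the kernel build): the ack_seq a
 * reset carries when the offending segment had no ACK, as computed
 * above - the sequence number plus everything the segment consumed
 * (SYN and FIN each occupy one sequence number, plus the payload).
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static uint32_t rst_ack_seq(uint32_t seq, int syn, int fin,
			    uint32_t payload_len)
{
	return seq + syn + fin + payload_len;
}

int main(void)
{
	/* A bare SYN: the RST acks ISN + 1. */
	printf("%u\n", rst_ack_seq(1000, 1, 0, 0));
	/* A 100-byte data segment: the RST acks seq + 100. */
	printf("%u\n", rst_ack_seq(5000, 0, 0, 100));
	return 0;
}
#endif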

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;

	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			     struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return tcp_v4_send_synack(sk, NULL, req, rvp);
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

static void syn_flood_warning(const struct sk_buff *skb)
{
	const char *msg;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies)
		msg = "Sending cookies";
	else
#endif
		msg = "Dropping request";

	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
		ntohs(tcp_hdr(skb)->dest), msg);
}

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
					      struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address -> MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address. */
static struct tcp_md5sig_key *
tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		}
		if (tcp_alloc_md5sig_pool(sk) == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}
EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
				 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
			} else if (tp->md5sig_info->entries4 != i) {
				/* Need to do some manipulation */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			tcp_free_md5sig_pool();
			return 0;
		}
	}
	return -ENOENT;
}
EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the key array itself,
	 * the crypto element, and then decrement our
	 * hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p;

		p = kzalloc(sizeof(*p), sk->sk_allocation);
		if (!p)
			return -EINVAL;

		tp->md5sig_info = p;
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}
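
/*
 * Userspace view (illustrative, not part of the kernel build): the
 * setsockopt() that lands in tcp_v4_parse_md5_keys() above. struct
 * tcp_md5sig and TCP_MD5SIG come from <linux/tcp.h>; the address, key
 * and error handling here are placeholders.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/tcp.h>

int main(void)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;	/* anything else is -EINVAL */
	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
	md5.tcpm_keylen = 6;
	memcpy(md5.tcpm_key, "secret", 6);

	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
		perror("TCP_MD5SIG");
	return 0;
}
#endif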

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			struct sock *sk, struct request_sock *req,
			struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
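
/*
 * Illustrative sketch (not part of the kernel build): the RFC 2385
 * digest input order that the helpers above implement - pseudo-header,
 * TCP header with a zeroed checksum field, payload, then the key.
 * OpenSSL's MD5 stands in for the kernel crypto API; all buffers are
 * placeholders.
 */
#if 0
#include <stdio.h>
#include <openssl/md5.h>

int main(void)
{
	unsigned char phdr[12] = { 0 };	/* saddr, daddr, 0, proto, len */
	unsigned char th[20] = { 0 };	/* TCP header, check field zeroed */
	unsigned char payload[] = "data";
	unsigned char key[] = "secret";
	unsigned char digest[MD5_DIGEST_LENGTH];
	MD5_CTX ctx;

	MD5_Init(&ctx);
	MD5_Update(&ctx, phdr, sizeof(phdr));		/* 1. pseudo-header */
	MD5_Update(&ctx, th, sizeof(th));		/* 2. TCP header */
	MD5_Update(&ctx, payload, sizeof(payload) - 1);	/* 3. segment data */
	MD5_Update(&ctx, key, sizeof(key) - 1);		/* 4. the key itself */
	MD5_Final(digest, &ctx);

	printf("%02x%02x...\n", digest[0], digest[1]);
	return 0;
}
#endif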

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return 1;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
			       &iph->saddr, ntohs(th->source),
			       &iph->daddr, ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast addresses */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitation: they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		if (net_ratelimit())
			syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* The accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request; that is better
	 * than clogging the syn queue with openreqs whose timeouts grow
	 * exponentially.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
		want_cookie = 0;	/* not our kind of cookie */
#endif
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->daddr.a4 == saddr) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies, the last quarter of the
			 * backlog is reserved for destinations proven
			 * to be alive, so during a synflood we keep
			 * communicating with destinations we already
			 * remembered at the moment the flood started.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, dst, req,
			       (struct request_values *)&tmp_ext) ||
	    want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
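
/*
 * Illustrative sketch (not part of the kernel build): the "last quarter
 * of the backlog" condition used above when syncookies are off - plain
 * C with placeholder numbers.
 */
#if 0
#include <stdio.h>

/* True once fewer than a quarter of the SYN backlog slots remain. */
static int syn_backlog_pressure(int max_syn_backlog, int queue_len)
{
	return max_syn_backlog - queue_len < (max_syn_backlog >> 2);
}

int main(void)
{
	printf("%d\n", syn_backlog_pressure(1024, 512));	/* 0: plenty left */
	printf("%d\n", syn_backlog_pressure(1024, 800));	/* 1: reserve proven peers */
	return 0;
}
#endif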

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr   = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (newinet->opt)
		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
					  newkey, key->keylen);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0) {
		sock_put(newsk);
		goto exit;
	}
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		sock_rps_save_rxhash(sk, skb->rxhash);
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb->rxhash);

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
{
	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct inet_peer *peer;

	if (!rt || rt->rt_dst != inet->inet_daddr) {
		peer = inet_getpeer_v4(inet->inet_daddr, 1);
		*release_it = true;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
		*release_it = false;
	}

	return peer;
}
EXPORT_SYMBOL(tcp_v4_get_peer);

void *tcp_v4_tw_get_peer(struct sock *sk)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);

	return inet_getpeer_v4(tw->tw_daddr, 1);
}
EXPORT_SYMBOL(tcp_v4_tw_get_peer);

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
	.twsk_getpeer	= tcp_v4_tw_get_peer,
};

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.get_peer	   = tcp_v4_get_peer,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	/* TCP Cookie Transactions */
	if (sysctl_tcp_cookie_size > 0) {
		/* Default, cookies without s_data_payload. */
		tp->cookie_values =
			kzalloc(sizeof(*tp->cookie_values),
				sk->sk_allocation);
		if (tp->cookie_values != NULL)
			kref_init(&tp->cookie_values->kref);
	}
	/* Presumed zeroed, in order of appearance:
	 *	cookie_in_always, cookie_out_never,
	 *	s_data_constant, s_data_in, s_data_out
	 */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	percpu_counter_inc(&tcp_sockets_allocated);
	local_bh_enable();

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	/* TCP Cookie Transactions */
	if (tp->cookie_values != NULL) {
		kref_put(&tp->cookie_values->kref,
			 tcp_cookie_values_release);
		tp->cookie_values = NULL;
	}

	percpu_counter_dec(&tcp_sockets_allocated);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{
	return hlist_nulls_empty(head) ? NULL :
		list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return !is_a_nulls(tw->tw_node.next) ?
		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

/*
 * Get the next listener socket after cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline int empty_bucket(struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}

/*
 * Get the first established socket starting from the bucket given in
 * st->bucket. If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		struct inet_timewait_sock *tw;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		spin_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* Look for the next non-empty bucket */
		st->offset = 0;
		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
				empty_bucket(st))
			;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			return NULL;

		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
		sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family	= afinfo->family;
	s->last_pos	= 0;
	return 0;
}

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_fops.open		= tcp_seq_open;
	afinfo->seq_fops.read		= seq_read;
	afinfo->seq_fops.llseek		= seq_lseek;
	afinfo->seq_fops.release	= seq_release_net;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     &afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(struct sock *sk, struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket, we might find a
		 * transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
		len);
}

static void get_timewait4_sock(struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
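
/*
 * Userspace view (illustrative, not part of the kernel build): the rows
 * printed by tcp4_seq_show() above, read back from /proc/net/tcp.
 * Addresses and ports are hex; the parsing below is deliberately
 * minimal.
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);	/* skip the header row */
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, st;
		struct in_addr a;

		if (sscanf(line, "%*d: %x:%x %*x:%*x %x",
			   &laddr, &lport, &st) != 3)
			continue;
		a.s_addr = laddr;	/* already network byte order */
		printf("%s:%u state %02x\n", inet_ntoa(a), lport, st);
	}
	fclose(f);
	return 0;
}
#endif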

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct iphdr *iph = skb_gro_network_header(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}

		/* fall through */
	case CHECKSUM_NONE:
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	return tcp_gro_receive(head, skb);
}

int tcp4_gro_complete(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
				  iph->saddr, iph->daddr, 0);
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
EXPORT_SYMBOL(tcp_prot);

static int __net_init tcp_sk_init(struct net *net)
{
	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}

static void __net_exit tcp_sk_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}