TCP: check min TTL on received ICMP packets
[linux-flexiantxendom0-natty.git] / net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86
87
88 #ifdef CONFIG_TCP_MD5SIG
89 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
90                                                    __be32 addr);
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, struct tcphdr *th);
93 #else
94 static inline
95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96 {
97         return NULL;
98 }
99 #endif
100
101 struct inet_hashinfo tcp_hashinfo;
102
103 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104 {
105         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106                                           ip_hdr(skb)->saddr,
107                                           tcp_hdr(skb)->dest,
108                                           tcp_hdr(skb)->source);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         /* With PAWS, it is safe from the viewpoint
117            of data integrity. Even without PAWS it is safe provided sequence
118            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119
120            Actually, the idea is close to VJ's: only the timestamp cache is
121            held not per host but per port pair, and the TW bucket is used as
122            the state holder.
123
124            If the TW bucket has already been destroyed we fall back to VJ's
125            scheme and use the initial timestamp retrieved from the peer table.
126          */
127         if (tcptw->tw_ts_recent_stamp &&
128             (twp == NULL || (sysctl_tcp_tw_reuse &&
129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131                 if (tp->write_seq == 0)
132                         tp->write_seq = 1;
133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135                 sock_hold(sktw);
136                 return 1;
137         }
138
139         return 0;
140 }
141
142 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
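/*
 * tcp_twsk_unique() above is what lets a new outgoing connection take over a
 * port pair that is still in TIME-WAIT, gated by the tcp_tw_reuse sysctl and
 * by the timestamp state saved in the TW bucket.  A minimal userspace sketch
 * of turning that knob on (illustrative only, not kernel code; assumes a
 * procfs-based system and sufficient privileges):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");
 *
 *		if (!f)
 *			return 1;
 *		fputs("1\n", f);
 *		return fclose(f) ? 1 : 0;
 *	}
 */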
143
144 /* This will initiate an outgoing connection. */
145 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
146 {
147         struct inet_sock *inet = inet_sk(sk);
148         struct tcp_sock *tp = tcp_sk(sk);
149         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150         struct rtable *rt;
151         __be32 daddr, nexthop;
152         int tmp;
153         int err;
154
155         if (addr_len < sizeof(struct sockaddr_in))
156                 return -EINVAL;
157
158         if (usin->sin_family != AF_INET)
159                 return -EAFNOSUPPORT;
160
161         nexthop = daddr = usin->sin_addr.s_addr;
162         if (inet->opt && inet->opt->srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet->opt->faddr;
166         }
167
168         tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
169                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170                                IPPROTO_TCP,
171                                inet->inet_sport, usin->sin_port, sk, 1);
172         if (tmp < 0) {
173                 if (tmp == -ENETUNREACH)
174                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
175                 return tmp;
176         }
177
178         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
179                 ip_rt_put(rt);
180                 return -ENETUNREACH;
181         }
182
183         if (!inet->opt || !inet->opt->srr)
184                 daddr = rt->rt_dst;
185
186         if (!inet->inet_saddr)
187                 inet->inet_saddr = rt->rt_src;
188         inet->inet_rcv_saddr = inet->inet_saddr;
189
190         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
191                 /* Reset inherited state */
192                 tp->rx_opt.ts_recent       = 0;
193                 tp->rx_opt.ts_recent_stamp = 0;
194                 tp->write_seq              = 0;
195         }
196
197         if (tcp_death_row.sysctl_tw_recycle &&
198             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
199                 struct inet_peer *peer = rt_get_peer(rt);
200                 /*
201                  * VJ's idea. We save the last timestamp seen from
202                  * the destination in the peer table when entering state
203                  * TIME-WAIT, and initialize rx_opt.ts_recent from it
204                  * when trying a new connection.
205                  */
206                 if (peer != NULL &&
207                     (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
208                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
209                         tp->rx_opt.ts_recent = peer->tcp_ts;
210                 }
211         }
212
213         inet->inet_dport = usin->sin_port;
214         inet->inet_daddr = daddr;
215
216         inet_csk(sk)->icsk_ext_hdr_len = 0;
217         if (inet->opt)
218                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219
220         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
221
222         /* Socket identity is still unknown (sport may be zero).
223          * However, we set the state to SYN-SENT and, without releasing the
224          * socket lock, select a source port, enter ourselves into the hash
225          * tables and complete initialization after this.
226          */
227         tcp_set_state(sk, TCP_SYN_SENT);
228         err = inet_hash_connect(&tcp_death_row, sk);
229         if (err)
230                 goto failure;
231
232         err = ip_route_newports(&rt, IPPROTO_TCP,
233                                 inet->inet_sport, inet->inet_dport, sk);
234         if (err)
235                 goto failure;
236
237         /* OK, now commit destination to socket.  */
238         sk->sk_gso_type = SKB_GSO_TCPV4;
239         sk_setup_caps(sk, &rt->u.dst);
240
241         if (!tp->write_seq)
242                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
243                                                            inet->inet_daddr,
244                                                            inet->inet_sport,
245                                                            usin->sin_port);
246
247         inet->inet_id = tp->write_seq ^ jiffies;
248
249         err = tcp_connect(sk);
250         rt = NULL;
251         if (err)
252                 goto failure;
253
254         return 0;
255
256 failure:
257         /*
258          * This unhashes the socket and releases the local port,
259          * if necessary.
260          */
261         tcp_set_state(sk, TCP_CLOSE);
262         ip_rt_put(rt);
263         sk->sk_route_caps = 0;
264         inet->inet_dport = 0;
265         return err;
266 }
267
268 /*
269  * This routine does path mtu discovery as defined in RFC1191.
270  */
271 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275
276         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
277          * sent out by Linux are always < 576 bytes, so they should go through
278          * unfragmented).
279          */
280         if (sk->sk_state == TCP_LISTEN)
281                 return;
282
283         /* We don't check in the dst entry if pmtu discovery is forbidden
284          * on this route. We just assume that no packet-too-big packets
285          * are sent back when pmtu discovery is not active.
286          * There is a small race when the user changes this flag in the
287          * route, but I think that's acceptable.
288          */
289         if ((dst = __sk_dst_check(sk, 0)) == NULL)
290                 return;
291
292         dst->ops->update_pmtu(dst, mtu);
293
294         /* Something is about to go wrong... Remember the soft error
295          * in case this connection is not able to recover.
296          */
297         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
298                 sk->sk_err_soft = EMSGSIZE;
299
300         mtu = dst_mtu(dst);
301
302         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
303             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304                 tcp_sync_mss(sk, mtu);
305
306                 /* Resend the TCP packet because it's
307                  * clear that the old packet has been
308                  * dropped. This is the new "fast" path mtu
309                  * discovery.
310                  */
311                 tcp_simple_retransmit(sk);
312         } /* else let the usual retransmit timer handle it */
313 }
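/*
 * Whether a connection reacts to the shrinking path MTU handled above, by
 * lowering its MSS and retransmitting, depends on the per-socket pmtudisc
 * setting checked a few lines up, which applications drive through the
 * IP_MTU_DISCOVER socket option.  A minimal userspace sketch (illustrative
 * only, not kernel code; the descriptor is assumed to be a TCP socket):
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	static int set_pmtu_discovery(int fd, int on)
 *	{
 *		int val = on ? IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
 *
 *		return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
 *				  &val, sizeof(val));
 *	}
 */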
314
315 /*
316  * This routine is called by the ICMP module when it gets some
317  * sort of error condition.  If err < 0 then the socket should
318  * be closed and the error returned to the user.  If err > 0
319  * it's just the icmp type << 8 | icmp code.  After adjustment
320  * header points to the first 8 bytes of the tcp header.  We need
321  * to find the appropriate port.
322  *
323  * The locking strategy used here is very "optimistic". When
324  * someone else accesses the socket the ICMP is just dropped
325  * and for some paths there is no check at all.
326  * A more general error queue to queue errors for later handling
327  * is probably better.
328  *
329  */
330
331 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
332 {
333         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
334         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
335         struct inet_connection_sock *icsk;
336         struct tcp_sock *tp;
337         struct inet_sock *inet;
338         const int type = icmp_hdr(icmp_skb)->type;
339         const int code = icmp_hdr(icmp_skb)->code;
340         struct sock *sk;
341         struct sk_buff *skb;
342         __u32 seq;
343         __u32 remaining;
344         int err;
345         struct net *net = dev_net(icmp_skb->dev);
346
347         if (icmp_skb->len < (iph->ihl << 2) + 8) {
348                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349                 return;
350         }
351
352         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353                         iph->saddr, th->source, inet_iif(icmp_skb));
354         if (!sk) {
355                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356                 return;
357         }
358         if (sk->sk_state == TCP_TIME_WAIT) {
359                 inet_twsk_put(inet_twsk(sk));
360                 return;
361         }
362
363         bh_lock_sock(sk);
364         /* If too many ICMPs get dropped on busy
365          * servers this needs to be solved differently.
366          */
367         if (sock_owned_by_user(sk))
368                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
370         if (sk->sk_state == TCP_CLOSE)
371                 goto out;
372
373         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
374                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
375                 goto out;
376         }
377
378         icsk = inet_csk(sk);
379         tp = tcp_sk(sk);
380         seq = ntohl(th->seq);
381         if (sk->sk_state != TCP_LISTEN &&
382             !between(seq, tp->snd_una, tp->snd_nxt)) {
383                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
384                 goto out;
385         }
386
387         switch (type) {
388         case ICMP_SOURCE_QUENCH:
389                 /* Just silently ignore these. */
390                 goto out;
391         case ICMP_PARAMETERPROB:
392                 err = EPROTO;
393                 break;
394         case ICMP_DEST_UNREACH:
395                 if (code > NR_ICMP_UNREACH)
396                         goto out;
397
398                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
399                         if (!sock_owned_by_user(sk))
400                                 do_pmtu_discovery(sk, iph, info);
401                         goto out;
402                 }
403
404                 err = icmp_err_convert[code].errno;
405                 /* check if icmp_skb allows revert of backoff
406                  * (see draft-zimmermann-tcp-lcd) */
407                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
408                         break;
409                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
410                     !icsk->icsk_backoff)
411                         break;
412
413                 icsk->icsk_backoff--;
414                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
415                                          icsk->icsk_backoff;
416                 tcp_bound_rto(sk);
417
418                 skb = tcp_write_queue_head(sk);
419                 BUG_ON(!skb);
420
421                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
422                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
423
424                 if (remaining) {
425                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
426                                                   remaining, TCP_RTO_MAX);
427                 } else if (sock_owned_by_user(sk)) {
428                         /* RTO revert clocked out retransmission,
429                          * but socket is locked. Will defer. */
430                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
431                                                   HZ/20, TCP_RTO_MAX);
432                 } else {
433                         /* RTO revert clocked out retransmission.
434                          * Will retransmit now */
435                         tcp_retransmit_timer(sk);
436                 }
437
438                 break;
439         case ICMP_TIME_EXCEEDED:
440                 err = EHOSTUNREACH;
441                 break;
442         default:
443                 goto out;
444         }
445
446         switch (sk->sk_state) {
447                 struct request_sock *req, **prev;
448         case TCP_LISTEN:
449                 if (sock_owned_by_user(sk))
450                         goto out;
451
452                 req = inet_csk_search_req(sk, &prev, th->dest,
453                                           iph->daddr, iph->saddr);
454                 if (!req)
455                         goto out;
456
457                 /* ICMPs are not backlogged, hence we cannot get
458                    an established socket here.
459                  */
460                 WARN_ON(req->sk);
461
462                 if (seq != tcp_rsk(req)->snt_isn) {
463                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
464                         goto out;
465                 }
466
467                 /*
468                  * Still in SYN_RECV, just remove it silently.
469                  * There is no good way to pass the error to the newly
470                  * created socket, and POSIX does not want network
471                  * errors returned from accept().
472                  */
473                 inet_csk_reqsk_queue_drop(sk, req, prev);
474                 goto out;
475
476         case TCP_SYN_SENT:
477         case TCP_SYN_RECV:  /* Cannot happen.
478                                Well, it can, e.g. if SYNs crossed.
479                              */
480                 if (!sock_owned_by_user(sk)) {
481                         sk->sk_err = err;
482
483                         sk->sk_error_report(sk);
484
485                         tcp_done(sk);
486                 } else {
487                         sk->sk_err_soft = err;
488                 }
489                 goto out;
490         }
491
492         /* If we've already connected we will keep trying
493          * until we time out, or the user gives up.
494          *
495          * RFC 1122 4.2.3.9 allows us to consider as hard errors
496          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
497          * but it is obsoleted by pmtu discovery).
498          *
499          * Note that in the modern internet, where routing is unreliable
500          * and broken firewalls sit in every dark corner sending random
501          * errors ordered by their masters, even these two messages finally
502          * lose their original meaning (even Linux sends invalid PORT_UNREACHs).
503          *
504          * Now we are in compliance with RFCs.
505          *                                                      --ANK (980905)
506          */
507
508         inet = inet_sk(sk);
509         if (!sock_owned_by_user(sk) && inet->recverr) {
510                 sk->sk_err = err;
511                 sk->sk_error_report(sk);
512         } else  { /* Only an error on timeout */
513                 sk->sk_err_soft = err;
514         }
515
516 out:
517         bh_unlock_sock(sk);
518         sock_put(sk);
519 }
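/*
 * The min_ttl test above makes tcp_v4_err() ignore ICMP errors when the
 * quoted TTL is below the socket's configured minimum, instead of letting
 * such errors affect the connection.  The minimum is set per socket with
 * IP_MINTTL (the RFC 5082 / GTSM style of protection).  A minimal userspace
 * sketch (illustrative only, not kernel code; assumes the running kernel
 * supports IP_MINTTL, hence the fallback define):
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	#ifndef IP_MINTTL
 *	#define IP_MINTTL 21
 *	#endif
 *
 *	static int require_directly_connected_peer(int fd)
 *	{
 *		int minttl = 255;
 *
 *		return setsockopt(fd, IPPROTO_IP, IP_MINTTL,
 *				  &minttl, sizeof(minttl));
 *	}
 */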
520
521 /* This routine computes an IPv4 TCP checksum. */
522 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
523 {
524         struct inet_sock *inet = inet_sk(sk);
525         struct tcphdr *th = tcp_hdr(skb);
526
527         if (skb->ip_summed == CHECKSUM_PARTIAL) {
528                 th->check = ~tcp_v4_check(len, inet->inet_saddr,
529                                           inet->inet_daddr, 0);
530                 skb->csum_start = skb_transport_header(skb) - skb->head;
531                 skb->csum_offset = offsetof(struct tcphdr, check);
532         } else {
533                 th->check = tcp_v4_check(len, inet->inet_saddr,
534                                          inet->inet_daddr,
535                                          csum_partial(th,
536                                                       th->doff << 2,
537                                                       skb->csum));
538         }
539 }
540
541 int tcp_v4_gso_send_check(struct sk_buff *skb)
542 {
543         const struct iphdr *iph;
544         struct tcphdr *th;
545
546         if (!pskb_may_pull(skb, sizeof(*th)))
547                 return -EINVAL;
548
549         iph = ip_hdr(skb);
550         th = tcp_hdr(skb);
551
552         th->check = 0;
553         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
554         skb->csum_start = skb_transport_header(skb) - skb->head;
555         skb->csum_offset = offsetof(struct tcphdr, check);
556         skb->ip_summed = CHECKSUM_PARTIAL;
557         return 0;
558 }
559
560 /*
561  *      This routine will send an RST to the other tcp.
562  *
563  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
564  *                    for the reset?
565  *      Answer: if a packet caused an RST, it is not for a socket
566  *              existing in our system; if it is matched to a socket,
567  *              it is just a duplicate segment or a bug in the other side's TCP.
568  *              So we build the reply based only on the parameters
569  *              that arrived with the segment.
570  *      Exception: precedence violation. We do not implement it in any case.
571  */
572
573 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
574 {
575         struct tcphdr *th = tcp_hdr(skb);
576         struct {
577                 struct tcphdr th;
578 #ifdef CONFIG_TCP_MD5SIG
579                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
580 #endif
581         } rep;
582         struct ip_reply_arg arg;
583 #ifdef CONFIG_TCP_MD5SIG
584         struct tcp_md5sig_key *key;
585 #endif
586         struct net *net;
587
588         /* Never send a reset in response to a reset. */
589         if (th->rst)
590                 return;
591
592         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
593                 return;
594
595         /* Swap the send and the receive. */
596         memset(&rep, 0, sizeof(rep));
597         rep.th.dest   = th->source;
598         rep.th.source = th->dest;
599         rep.th.doff   = sizeof(struct tcphdr) / 4;
600         rep.th.rst    = 1;
601
602         if (th->ack) {
603                 rep.th.seq = th->ack_seq;
604         } else {
605                 rep.th.ack = 1;
606                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
607                                        skb->len - (th->doff << 2));
608         }
609
610         memset(&arg, 0, sizeof(arg));
611         arg.iov[0].iov_base = (unsigned char *)&rep;
612         arg.iov[0].iov_len  = sizeof(rep.th);
613
614 #ifdef CONFIG_TCP_MD5SIG
615         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
616         if (key) {
617                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
618                                    (TCPOPT_NOP << 16) |
619                                    (TCPOPT_MD5SIG << 8) |
620                                    TCPOLEN_MD5SIG);
621                 /* Update length and the length the header thinks exists */
622                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
623                 rep.th.doff = arg.iov[0].iov_len / 4;
624
625                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
626                                      key, ip_hdr(skb)->saddr,
627                                      ip_hdr(skb)->daddr, &rep.th);
628         }
629 #endif
630         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
631                                       ip_hdr(skb)->saddr, /* XXX */
632                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
633         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
634         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
635
636         net = dev_net(skb_dst(skb)->dev);
637         ip_send_reply(net->ipv4.tcp_sock, skb,
638                       &arg, arg.iov[0].iov_len);
639
640         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
641         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
642 }
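/*
 * A worked example of the ack_seq arithmetic above, for a reset that answers
 * a segment carrying no ACK: the RST acknowledges exactly the sequence space
 * the offending segment consumed.  For a bare SYN with seq = 1000 and no
 * payload, that is 1000 + 1 (SYN) + 0 (FIN) + 0 bytes of data, so the reset
 * is sent with ack_seq = 1001.
 */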
643
644 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
645    outside socket context, is certainly ugly. What can I do?
646  */
647
648 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
649                             u32 win, u32 ts, int oif,
650                             struct tcp_md5sig_key *key,
651                             int reply_flags)
652 {
653         struct tcphdr *th = tcp_hdr(skb);
654         struct {
655                 struct tcphdr th;
656                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
657 #ifdef CONFIG_TCP_MD5SIG
658                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
659 #endif
660                         ];
661         } rep;
662         struct ip_reply_arg arg;
663         struct net *net = dev_net(skb_dst(skb)->dev);
664
665         memset(&rep.th, 0, sizeof(struct tcphdr));
666         memset(&arg, 0, sizeof(arg));
667
668         arg.iov[0].iov_base = (unsigned char *)&rep;
669         arg.iov[0].iov_len  = sizeof(rep.th);
670         if (ts) {
671                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
672                                    (TCPOPT_TIMESTAMP << 8) |
673                                    TCPOLEN_TIMESTAMP);
674                 rep.opt[1] = htonl(tcp_time_stamp);
675                 rep.opt[2] = htonl(ts);
676                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
677         }
678
679         /* Swap the send and the receive. */
680         rep.th.dest    = th->source;
681         rep.th.source  = th->dest;
682         rep.th.doff    = arg.iov[0].iov_len / 4;
683         rep.th.seq     = htonl(seq);
684         rep.th.ack_seq = htonl(ack);
685         rep.th.ack     = 1;
686         rep.th.window  = htons(win);
687
688 #ifdef CONFIG_TCP_MD5SIG
689         if (key) {
690                 int offset = (ts) ? 3 : 0;
691
692                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
693                                           (TCPOPT_NOP << 16) |
694                                           (TCPOPT_MD5SIG << 8) |
695                                           TCPOLEN_MD5SIG);
696                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
697                 rep.th.doff = arg.iov[0].iov_len/4;
698
699                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
700                                     key, ip_hdr(skb)->saddr,
701                                     ip_hdr(skb)->daddr, &rep.th);
702         }
703 #endif
704         arg.flags = reply_flags;
705         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
706                                       ip_hdr(skb)->saddr, /* XXX */
707                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
708         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
709         if (oif)
710                 arg.bound_dev_if = oif;
711
712         ip_send_reply(net->ipv4.tcp_sock, skb,
713                       &arg, arg.iov[0].iov_len);
714
715         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
716 }
717
718 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
719 {
720         struct inet_timewait_sock *tw = inet_twsk(sk);
721         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
722
723         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
724                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
725                         tcptw->tw_ts_recent,
726                         tw->tw_bound_dev_if,
727                         tcp_twsk_md5_key(tcptw),
728                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
729                         );
730
731         inet_twsk_put(tw);
732 }
733
734 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
735                                   struct request_sock *req)
736 {
737         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
738                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
739                         req->ts_recent,
740                         0,
741                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
742                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
743 }
744
745 /*
746  *      Send a SYN-ACK after having received a SYN.
747  *      This still operates on a request_sock only, not on a big
748  *      socket.
749  */
750 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
751                               struct request_sock *req,
752                               struct request_values *rvp)
753 {
754         const struct inet_request_sock *ireq = inet_rsk(req);
755         int err = -1;
756         struct sk_buff * skb;
757
758         /* First, grab a route. */
759         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
760                 return -1;
761
762         skb = tcp_make_synack(sk, dst, req, rvp);
763
764         if (skb) {
765                 struct tcphdr *th = tcp_hdr(skb);
766
767                 th->check = tcp_v4_check(skb->len,
768                                          ireq->loc_addr,
769                                          ireq->rmt_addr,
770                                          csum_partial(th, skb->len,
771                                                       skb->csum));
772
773                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
774                                             ireq->rmt_addr,
775                                             ireq->opt);
776                 err = net_xmit_eval(err);
777         }
778
779         dst_release(dst);
780         return err;
781 }
782
783 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
784                               struct request_values *rvp)
785 {
786         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
787         return tcp_v4_send_synack(sk, NULL, req, rvp);
788 }
789
790 /*
791  *      IPv4 request_sock destructor.
792  */
793 static void tcp_v4_reqsk_destructor(struct request_sock *req)
794 {
795         kfree(inet_rsk(req)->opt);
796 }
797
798 #ifdef CONFIG_SYN_COOKIES
799 static void syn_flood_warning(struct sk_buff *skb)
800 {
801         static unsigned long warntime;
802
803         if (time_after(jiffies, (warntime + HZ * 60))) {
804                 warntime = jiffies;
805                 printk(KERN_INFO
806                        "possible SYN flooding on port %d. Sending cookies.\n",
807                        ntohs(tcp_hdr(skb)->dest));
808         }
809 }
810 #endif
811
812 /*
813  * Save and compile IPv4 options into the request_sock if needed.
814  */
815 static struct ip_options *tcp_v4_save_options(struct sock *sk,
816                                               struct sk_buff *skb)
817 {
818         struct ip_options *opt = &(IPCB(skb)->opt);
819         struct ip_options *dopt = NULL;
820
821         if (opt && opt->optlen) {
822                 int opt_size = optlength(opt);
823                 dopt = kmalloc(opt_size, GFP_ATOMIC);
824                 if (dopt) {
825                         if (ip_options_echo(dopt, skb)) {
826                                 kfree(dopt);
827                                 dopt = NULL;
828                         }
829                 }
830         }
831         return dopt;
832 }
833
834 #ifdef CONFIG_TCP_MD5SIG
835 /*
836  * RFC2385 MD5 checksumming requires a mapping of
837  * IP address->MD5 Key.
838  * We need to maintain these in the sk structure.
839  */
840
841 /* Find the Key structure for an address.  */
842 static struct tcp_md5sig_key *
843                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
844 {
845         struct tcp_sock *tp = tcp_sk(sk);
846         int i;
847
848         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
849                 return NULL;
850         for (i = 0; i < tp->md5sig_info->entries4; i++) {
851                 if (tp->md5sig_info->keys4[i].addr == addr)
852                         return &tp->md5sig_info->keys4[i].base;
853         }
854         return NULL;
855 }
856
857 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
858                                          struct sock *addr_sk)
859 {
860         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
861 }
862
863 EXPORT_SYMBOL(tcp_v4_md5_lookup);
864
865 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
866                                                       struct request_sock *req)
867 {
868         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
869 }
870
871 /* This can be called on a newly created socket, from other files */
872 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
873                       u8 *newkey, u8 newkeylen)
874 {
875         /* Add Key to the list */
876         struct tcp_md5sig_key *key;
877         struct tcp_sock *tp = tcp_sk(sk);
878         struct tcp4_md5sig_key *keys;
879
880         key = tcp_v4_md5_do_lookup(sk, addr);
881         if (key) {
882                 /* Pre-existing entry - just update that one. */
883                 kfree(key->key);
884                 key->key = newkey;
885                 key->keylen = newkeylen;
886         } else {
887                 struct tcp_md5sig_info *md5sig;
888
889                 if (!tp->md5sig_info) {
890                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
891                                                   GFP_ATOMIC);
892                         if (!tp->md5sig_info) {
893                                 kfree(newkey);
894                                 return -ENOMEM;
895                         }
896                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
897                 }
898                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
899                         kfree(newkey);
900                         return -ENOMEM;
901                 }
902                 md5sig = tp->md5sig_info;
903
904                 if (md5sig->alloced4 == md5sig->entries4) {
905                         keys = kmalloc((sizeof(*keys) *
906                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
907                         if (!keys) {
908                                 kfree(newkey);
909                                 tcp_free_md5sig_pool();
910                                 return -ENOMEM;
911                         }
912
913                         if (md5sig->entries4)
914                                 memcpy(keys, md5sig->keys4,
915                                        sizeof(*keys) * md5sig->entries4);
916
917                         /* Free old key list, and reference new one */
918                         kfree(md5sig->keys4);
919                         md5sig->keys4 = keys;
920                         md5sig->alloced4++;
921                 }
922                 md5sig->entries4++;
923                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
924                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
925                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
926         }
927         return 0;
928 }
929
930 EXPORT_SYMBOL(tcp_v4_md5_do_add);
931
932 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
933                                u8 *newkey, u8 newkeylen)
934 {
935         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
936                                  newkey, newkeylen);
937 }
938
939 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
940 {
941         struct tcp_sock *tp = tcp_sk(sk);
942         int i;
943
944         for (i = 0; i < tp->md5sig_info->entries4; i++) {
945                 if (tp->md5sig_info->keys4[i].addr == addr) {
946                         /* Free the key */
947                         kfree(tp->md5sig_info->keys4[i].base.key);
948                         tp->md5sig_info->entries4--;
949
950                         if (tp->md5sig_info->entries4 == 0) {
951                                 kfree(tp->md5sig_info->keys4);
952                                 tp->md5sig_info->keys4 = NULL;
953                                 tp->md5sig_info->alloced4 = 0;
954                         } else if (tp->md5sig_info->entries4 != i) {
955                                 /* Need to do some manipulation */
956                                 memmove(&tp->md5sig_info->keys4[i],
957                                         &tp->md5sig_info->keys4[i+1],
958                                         (tp->md5sig_info->entries4 - i) *
959                                          sizeof(struct tcp4_md5sig_key));
960                         }
961                         tcp_free_md5sig_pool();
962                         return 0;
963                 }
964         }
965         return -ENOENT;
966 }
967
968 EXPORT_SYMBOL(tcp_v4_md5_do_del);
969
970 static void tcp_v4_clear_md5_list(struct sock *sk)
971 {
972         struct tcp_sock *tp = tcp_sk(sk);
973
974         /* Free each key, then the list of keys itself,
975          * the crypto element, and then decrement our
976          * hold on the last-resort crypto.
977          */
978         if (tp->md5sig_info->entries4) {
979                 int i;
980                 for (i = 0; i < tp->md5sig_info->entries4; i++)
981                         kfree(tp->md5sig_info->keys4[i].base.key);
982                 tp->md5sig_info->entries4 = 0;
983                 tcp_free_md5sig_pool();
984         }
985         if (tp->md5sig_info->keys4) {
986                 kfree(tp->md5sig_info->keys4);
987                 tp->md5sig_info->keys4 = NULL;
988                 tp->md5sig_info->alloced4  = 0;
989         }
990 }
991
992 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
993                                  int optlen)
994 {
995         struct tcp_md5sig cmd;
996         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
997         u8 *newkey;
998
999         if (optlen < sizeof(cmd))
1000                 return -EINVAL;
1001
1002         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1003                 return -EFAULT;
1004
1005         if (sin->sin_family != AF_INET)
1006                 return -EINVAL;
1007
1008         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1009                 if (!tcp_sk(sk)->md5sig_info)
1010                         return -ENOENT;
1011                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1012         }
1013
1014         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1015                 return -EINVAL;
1016
1017         if (!tcp_sk(sk)->md5sig_info) {
1018                 struct tcp_sock *tp = tcp_sk(sk);
1019                 struct tcp_md5sig_info *p;
1020
1021                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1022                 if (!p)
1023                         return -EINVAL;
1024
1025                 tp->md5sig_info = p;
1026                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1027         }
1028
1029         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1030         if (!newkey)
1031                 return -ENOMEM;
1032         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1033                                  newkey, cmd.tcpm_keylen);
1034 }
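/*
 * tcp_v4_parse_md5_keys() above is the kernel half of the TCP_MD5SIG socket
 * option (RFC 2385 signatures, used mainly to protect BGP sessions).
 * Userspace hands it a struct tcp_md5sig naming the peer and carrying the
 * shared key.  A minimal sketch (illustrative only, not kernel code; the
 * exact userspace header providing struct tcp_md5sig may vary by libc):
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <linux/tcp.h>
 *
 *	static int add_md5_peer(int fd, const struct sockaddr_in *peer,
 *				const void *key, unsigned int keylen)
 *	{
 *		struct tcp_md5sig md5;
 *
 *		if (keylen > TCP_MD5SIG_MAXKEYLEN)
 *			return -1;
 *		memset(&md5, 0, sizeof(md5));
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = keylen;
 *		memcpy(md5.tcpm_key, key, keylen);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &md5, sizeof(md5));
 *	}
 */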
1035
1036 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1037                                         __be32 daddr, __be32 saddr, int nbytes)
1038 {
1039         struct tcp4_pseudohdr *bp;
1040         struct scatterlist sg;
1041
1042         bp = &hp->md5_blk.ip4;
1043
1044         /*
1045          * 1. the TCP pseudo-header (in the order: source IP address,
1046          * destination IP address, zero-padded protocol number, and
1047          * segment length)
1048          */
1049         bp->saddr = saddr;
1050         bp->daddr = daddr;
1051         bp->pad = 0;
1052         bp->protocol = IPPROTO_TCP;
1053         bp->len = cpu_to_be16(nbytes);
1054
1055         sg_init_one(&sg, bp, sizeof(*bp));
1056         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1057 }
1058
1059 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1060                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1061 {
1062         struct tcp_md5sig_pool *hp;
1063         struct hash_desc *desc;
1064
1065         hp = tcp_get_md5sig_pool();
1066         if (!hp)
1067                 goto clear_hash_noput;
1068         desc = &hp->md5_desc;
1069
1070         if (crypto_hash_init(desc))
1071                 goto clear_hash;
1072         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1073                 goto clear_hash;
1074         if (tcp_md5_hash_header(hp, th))
1075                 goto clear_hash;
1076         if (tcp_md5_hash_key(hp, key))
1077                 goto clear_hash;
1078         if (crypto_hash_final(desc, md5_hash))
1079                 goto clear_hash;
1080
1081         tcp_put_md5sig_pool();
1082         return 0;
1083
1084 clear_hash:
1085         tcp_put_md5sig_pool();
1086 clear_hash_noput:
1087         memset(md5_hash, 0, 16);
1088         return 1;
1089 }
1090
1091 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1092                         struct sock *sk, struct request_sock *req,
1093                         struct sk_buff *skb)
1094 {
1095         struct tcp_md5sig_pool *hp;
1096         struct hash_desc *desc;
1097         struct tcphdr *th = tcp_hdr(skb);
1098         __be32 saddr, daddr;
1099
1100         if (sk) {
1101                 saddr = inet_sk(sk)->inet_saddr;
1102                 daddr = inet_sk(sk)->inet_daddr;
1103         } else if (req) {
1104                 saddr = inet_rsk(req)->loc_addr;
1105                 daddr = inet_rsk(req)->rmt_addr;
1106         } else {
1107                 const struct iphdr *iph = ip_hdr(skb);
1108                 saddr = iph->saddr;
1109                 daddr = iph->daddr;
1110         }
1111
1112         hp = tcp_get_md5sig_pool();
1113         if (!hp)
1114                 goto clear_hash_noput;
1115         desc = &hp->md5_desc;
1116
1117         if (crypto_hash_init(desc))
1118                 goto clear_hash;
1119
1120         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1121                 goto clear_hash;
1122         if (tcp_md5_hash_header(hp, th))
1123                 goto clear_hash;
1124         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1125                 goto clear_hash;
1126         if (tcp_md5_hash_key(hp, key))
1127                 goto clear_hash;
1128         if (crypto_hash_final(desc, md5_hash))
1129                 goto clear_hash;
1130
1131         tcp_put_md5sig_pool();
1132         return 0;
1133
1134 clear_hash:
1135         tcp_put_md5sig_pool();
1136 clear_hash_noput:
1137         memset(md5_hash, 0, 16);
1138         return 1;
1139 }
1140
1141 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1142
1143 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1144 {
1145         /*
1146          * This gets called for each TCP segment that arrives
1147          * so we want to be efficient.
1148          * We have 3 drop cases:
1149          * o No MD5 hash and one expected.
1150          * o MD5 hash and we're not expecting one.
1151          * o MD5 hash and it's wrong.
1152          */
1153         __u8 *hash_location = NULL;
1154         struct tcp_md5sig_key *hash_expected;
1155         const struct iphdr *iph = ip_hdr(skb);
1156         struct tcphdr *th = tcp_hdr(skb);
1157         int genhash;
1158         unsigned char newhash[16];
1159
1160         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1161         hash_location = tcp_parse_md5sig_option(th);
1162
1163         /* We've parsed the options - do we have a hash? */
1164         if (!hash_expected && !hash_location)
1165                 return 0;
1166
1167         if (hash_expected && !hash_location) {
1168                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1169                 return 1;
1170         }
1171
1172         if (!hash_expected && hash_location) {
1173                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1174                 return 1;
1175         }
1176
1177         /* Okay, so this is hash_expected and hash_location -
1178          * so we need to calculate the checksum.
1179          */
1180         genhash = tcp_v4_md5_hash_skb(newhash,
1181                                       hash_expected,
1182                                       NULL, NULL, skb);
1183
1184         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1185                 if (net_ratelimit()) {
1186                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1187                                &iph->saddr, ntohs(th->source),
1188                                &iph->daddr, ntohs(th->dest),
1189                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1190                 }
1191                 return 1;
1192         }
1193         return 0;
1194 }
1195
1196 #endif
1197
1198 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1199         .family         =       PF_INET,
1200         .obj_size       =       sizeof(struct tcp_request_sock),
1201         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1202         .send_ack       =       tcp_v4_reqsk_send_ack,
1203         .destructor     =       tcp_v4_reqsk_destructor,
1204         .send_reset     =       tcp_v4_send_reset,
1205         .syn_ack_timeout =      tcp_syn_ack_timeout,
1206 };
1207
1208 #ifdef CONFIG_TCP_MD5SIG
1209 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1210         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1211         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1212 };
1213 #endif
1214
1215 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1216         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1217         .twsk_unique    = tcp_twsk_unique,
1218         .twsk_destructor= tcp_twsk_destructor,
1219 };
1220
1221 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1222 {
1223         struct tcp_extend_values tmp_ext;
1224         struct tcp_options_received tmp_opt;
1225         u8 *hash_location;
1226         struct request_sock *req;
1227         struct inet_request_sock *ireq;
1228         struct tcp_sock *tp = tcp_sk(sk);
1229         struct dst_entry *dst = NULL;
1230         __be32 saddr = ip_hdr(skb)->saddr;
1231         __be32 daddr = ip_hdr(skb)->daddr;
1232         __u32 isn = TCP_SKB_CB(skb)->when;
1233 #ifdef CONFIG_SYN_COOKIES
1234         int want_cookie = 0;
1235 #else
1236 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1237 #endif
1238
1239         /* Never answer SYNs sent to broadcast or multicast */
1240         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1241                 goto drop;
1242
1243         /* TW buckets are converted to open requests without
1244          * limitations; they conserve resources and the peer is
1245          * evidently a real one.
1246          */
1247         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1248 #ifdef CONFIG_SYN_COOKIES
1249                 if (sysctl_tcp_syncookies) {
1250                         want_cookie = 1;
1251                 } else
1252 #endif
1253                 goto drop;
1254         }
1255
1256         /* Accept backlog is full. If we have already queued enough
1257          * warm entries in the syn queue, drop the request. It is better than
1258          * clogging the syn queue with openreqs whose timeout increases
1259          * exponentially.
1260          */
1261         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1262                 goto drop;
1263
1264         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1265         if (!req)
1266                 goto drop;
1267
1268 #ifdef CONFIG_TCP_MD5SIG
1269         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1270 #endif
1271
1272         tcp_clear_options(&tmp_opt);
1273         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1274         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1275         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1276
1277         if (tmp_opt.cookie_plus > 0 &&
1278             tmp_opt.saw_tstamp &&
1279             !tp->rx_opt.cookie_out_never &&
1280             (sysctl_tcp_cookie_size > 0 ||
1281              (tp->cookie_values != NULL &&
1282               tp->cookie_values->cookie_desired > 0))) {
1283                 u8 *c;
1284                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1285                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1286
1287                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1288                         goto drop_and_release;
1289
1290                 /* Secret recipe starts with IP addresses */
1291                 *mess++ ^= daddr;
1292                 *mess++ ^= saddr;
1293
1294                 /* plus variable length Initiator Cookie */
1295                 c = (u8 *)mess;
1296                 while (l-- > 0)
1297                         *c++ ^= *hash_location++;
1298
1299 #ifdef CONFIG_SYN_COOKIES
1300                 want_cookie = 0;        /* not our kind of cookie */
1301 #endif
1302                 tmp_ext.cookie_out_never = 0; /* false */
1303                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1304         } else if (!tp->rx_opt.cookie_in_always) {
1305                 /* redundant indications, but ensure initialization. */
1306                 tmp_ext.cookie_out_never = 1; /* true */
1307                 tmp_ext.cookie_plus = 0;
1308         } else {
1309                 goto drop_and_release;
1310         }
1311         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1312
1313         if (want_cookie && !tmp_opt.saw_tstamp)
1314                 tcp_clear_options(&tmp_opt);
1315
1316         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1317         tcp_openreq_init(req, &tmp_opt, skb);
1318
1319         ireq = inet_rsk(req);
1320         ireq->loc_addr = daddr;
1321         ireq->rmt_addr = saddr;
1322         ireq->no_srccheck = inet_sk(sk)->transparent;
1323         ireq->opt = tcp_v4_save_options(sk, skb);
1324
1325         if (security_inet_conn_request(sk, skb, req))
1326                 goto drop_and_free;
1327
1328         if (!want_cookie)
1329                 TCP_ECN_create_request(req, tcp_hdr(skb));
1330
1331         if (want_cookie) {
1332 #ifdef CONFIG_SYN_COOKIES
1333                 syn_flood_warning(skb);
1334                 req->cookie_ts = tmp_opt.tstamp_ok;
1335 #endif
1336                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1337         } else if (!isn) {
1338                 struct inet_peer *peer = NULL;
1339
1340                 /* VJ's idea. We save the last timestamp seen
1341                  * from the destination in the peer table when entering
1342                  * state TIME-WAIT, and check against it before
1343                  * accepting a new connection request.
1344                  *
1345                  * If "isn" is not zero, this request hit an alive
1346                  * timewait bucket, so all the necessary checks
1347                  * are made in the function processing the timewait state.
1348                  */
1349                 if (tmp_opt.saw_tstamp &&
1350                     tcp_death_row.sysctl_tw_recycle &&
1351                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1352                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1353                     peer->v4daddr == saddr) {
1354                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1355                             (s32)(peer->tcp_ts - req->ts_recent) >
1356                                                         TCP_PAWS_WINDOW) {
1357                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1358                                 goto drop_and_release;
1359                         }
1360                 }
1361                 /* Kill the following clause if you dislike this heuristic. */
1362                 else if (!sysctl_tcp_syncookies &&
1363                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1364                           (sysctl_max_syn_backlog >> 2)) &&
1365                          (!peer || !peer->tcp_ts_stamp) &&
1366                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1367                         /* Without syncookies, the last quarter of the
1368                          * backlog is reserved for destinations proven
1369                          * to be alive.
1370                          * It means that we continue to communicate only
1371                          * with destinations we already remembered by
1372                          * the moment the synflood started.
1373                          */
1374                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1375                                        &saddr, ntohs(tcp_hdr(skb)->source));
1376                         goto drop_and_release;
1377                 }
1378
1379                 isn = tcp_v4_init_sequence(skb);
1380         }
1381         tcp_rsk(req)->snt_isn = isn;
1382
1383         if (tcp_v4_send_synack(sk, dst, req,
1384                                (struct request_values *)&tmp_ext) ||
1385             want_cookie)
1386                 goto drop_and_free;
1387
1388         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1389         return 0;
1390
1391 drop_and_release:
1392         dst_release(dst);
1393 drop_and_free:
1394         reqsk_free(req);
1395 drop:
1396         return 0;
1397 }
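/*
 * A worked example of the quarter-of-backlog rule above: with
 * sysctl_max_syn_backlog = 1024 and syncookies disabled, once fewer than 256
 * request slots remain (i.e. the SYN queue already holds more than 768
 * entries), new requests are accepted only from destinations for which we
 * have a remembered timestamp or a cached RTT; the rest are dropped.
 */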
1398
1399
1400 /*
1401  * The three-way handshake has completed - we got a valid ACK -
1402  * now create the new socket.
1403  */
1404 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1405                                   struct request_sock *req,
1406                                   struct dst_entry *dst)
1407 {
1408         struct inet_request_sock *ireq;
1409         struct inet_sock *newinet;
1410         struct tcp_sock *newtp;
1411         struct sock *newsk;
1412 #ifdef CONFIG_TCP_MD5SIG
1413         struct tcp_md5sig_key *key;
1414 #endif
1415
1416         if (sk_acceptq_is_full(sk))
1417                 goto exit_overflow;
1418
1419         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1420                 goto exit;
1421
1422         newsk = tcp_create_openreq_child(sk, req, skb);
1423         if (!newsk)
1424                 goto exit;
1425
1426         newsk->sk_gso_type = SKB_GSO_TCPV4;
1427         sk_setup_caps(newsk, dst);
1428
1429         newtp                 = tcp_sk(newsk);
1430         newinet               = inet_sk(newsk);
1431         ireq                  = inet_rsk(req);
1432         newinet->inet_daddr   = ireq->rmt_addr;
1433         newinet->inet_rcv_saddr = ireq->loc_addr;
1434         newinet->inet_saddr           = ireq->loc_addr;
1435         newinet->opt          = ireq->opt;
1436         ireq->opt             = NULL;
1437         newinet->mc_index     = inet_iif(skb);
1438         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1439         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1440         if (newinet->opt)
1441                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1442         newinet->inet_id = newtp->write_seq ^ jiffies;
1443
1444         tcp_mtup_init(newsk);
1445         tcp_sync_mss(newsk, dst_mtu(dst));
1446         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1447         if (tcp_sk(sk)->rx_opt.user_mss &&
1448             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1449                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1450
1451         tcp_initialize_rcv_mss(newsk);
1452
1453 #ifdef CONFIG_TCP_MD5SIG
1454         /* Copy over the MD5 key from the original socket */
1455         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1456         if (key != NULL) {
1457                 /*
1458                  * We're using one, so create a matching key
1459                  * on the newsk structure. If we fail to get
1460                  * memory, then we end up not copying the key
1461                  * across. Shucks.
1462                  */
1463                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1464                 if (newkey != NULL)
1465                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1466                                           newkey, key->keylen);
1467                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1468         }
1469 #endif
1470
1471         __inet_hash_nolisten(newsk, NULL);
1472         __inet_inherit_port(sk, newsk);
1473
1474         return newsk;
1475
1476 exit_overflow:
1477         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1478 exit:
1479         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1480         dst_release(dst);
1481         return NULL;
1482 }
1483
1484 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1485 {
1486         struct tcphdr *th = tcp_hdr(skb);
1487         const struct iphdr *iph = ip_hdr(skb);
1488         struct sock *nsk;
1489         struct request_sock **prev;
1490         /* Find possible connection requests. */
1491         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1492                                                        iph->saddr, iph->daddr);
1493         if (req)
1494                 return tcp_check_req(sk, skb, req, prev);
1495
1496         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1497                         th->source, iph->daddr, th->dest, inet_iif(skb));
1498
1499         if (nsk) {
1500                 if (nsk->sk_state != TCP_TIME_WAIT) {
1501                         bh_lock_sock(nsk);
1502                         return nsk;
1503                 }
1504                 inet_twsk_put(inet_twsk(nsk));
1505                 return NULL;
1506         }
1507
1508 #ifdef CONFIG_SYN_COOKIES
1509         if (!th->rst && !th->syn && th->ack)
1510                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1511 #endif
1512         return sk;
1513 }
1514
1515 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1516 {
1517         const struct iphdr *iph = ip_hdr(skb);
1518
1519         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1520                 if (!tcp_v4_check(skb->len, iph->saddr,
1521                                   iph->daddr, skb->csum)) {
1522                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1523                         return 0;
1524                 }
1525         }
1526
1527         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1528                                        skb->len, IPPROTO_TCP, 0);
1529
1530         if (skb->len <= 76) {
1531                 return __skb_checksum_complete(skb);
1532         }
1533         return 0;
1534 }
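
/*
 * A minimal userspace sketch (illustrative only, wrapped in #if 0 so it
 * is never built) of the RFC 793 checksum that tcp_v4_checksum_init()
 * defers above: the pseudo-header seeded by csum_tcpudp_nofold() is the
 * source/destination address, a zero byte, the protocol number and the
 * TCP length, summed together with the TCP header and payload as 16-bit
 * big-endian words, carry-folded and complemented. The function names
 * below are made up for the example.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint32_t sum_be16(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)((p[i] << 8) | p[i + 1]);
	if (len & 1)			/* pad an odd length with a zero byte */
		sum += (uint32_t)(p[len - 1] << 8);
	return sum;
}

/* Assumes a valid segment, i.e. len fits in the 16-bit TCP length field. */
static uint16_t tcp_checksum_sketch(const uint8_t saddr[4],
				    const uint8_t daddr[4],
				    const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;

	sum += sum_be16(saddr, 4);	/* pseudo-header: source address   */
	sum += sum_be16(daddr, 4);	/* pseudo-header: destination      */
	sum += 6;			/* zero byte + IPPROTO_TCP         */
	sum += (uint32_t)len;		/* TCP header + payload length     */
	sum += sum_be16(seg, len);	/* the TCP segment itself          */

	while (sum >> 16)		/* fold the carries ...            */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;		/* ... and take the complement     */
}
#endif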
1535
1536
1537 /* The socket must have its spinlock held when we get
1538  * here.
1539  *
1540  * We have a potential double-lock case here, so even when
1541  * doing backlog processing we use the BH locking scheme.
1542  * This is because we cannot sleep with the original spinlock
1543  * held.
1544  */
1545 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1546 {
1547         struct sock *rsk;
1548 #ifdef CONFIG_TCP_MD5SIG
1549         /*
1550          * We really want to reject the packet as early as possible
1551          * if:
1552          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1553          *  o There is an MD5 option and we're not expecting one
1554          */
1555         if (tcp_v4_inbound_md5_hash(sk, skb))
1556                 goto discard;
1557 #endif
1558
1559         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1560                 TCP_CHECK_TIMER(sk);
1561                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1562                         rsk = sk;
1563                         goto reset;
1564                 }
1565                 TCP_CHECK_TIMER(sk);
1566                 return 0;
1567         }
1568
1569         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1570                 goto csum_err;
1571
1572         if (sk->sk_state == TCP_LISTEN) {
1573                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1574                 if (!nsk)
1575                         goto discard;
1576
1577                 if (nsk != sk) {
1578                         if (tcp_child_process(sk, nsk, skb)) {
1579                                 rsk = nsk;
1580                                 goto reset;
1581                         }
1582                         return 0;
1583                 }
1584         }
1585
1586         TCP_CHECK_TIMER(sk);
1587         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1588                 rsk = sk;
1589                 goto reset;
1590         }
1591         TCP_CHECK_TIMER(sk);
1592         return 0;
1593
1594 reset:
1595         tcp_v4_send_reset(rsk, skb);
1596 discard:
1597         kfree_skb(skb);
1598         /* Be careful here. If this function gets more complicated and
1599          * gcc suffers from register pressure on the x86, sk (in %ebx)
1600          * might be destroyed here. This current version compiles correctly,
1601          * but you have been warned.
1602          */
1603         return 0;
1604
1605 csum_err:
1606         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1607         goto discard;
1608 }
1609
1610 /*
1611  *      From tcp_input.c
1612  */
1613
1614 int tcp_v4_rcv(struct sk_buff *skb)
1615 {
1616         const struct iphdr *iph;
1617         struct tcphdr *th;
1618         struct sock *sk;
1619         int ret;
1620         struct net *net = dev_net(skb->dev);
1621
1622         if (skb->pkt_type != PACKET_HOST)
1623                 goto discard_it;
1624
1625         /* Count it even if it's bad */
1626         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1627
1628         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1629                 goto discard_it;
1630
1631         th = tcp_hdr(skb);
1632
1633         if (th->doff < sizeof(struct tcphdr) / 4)
1634                 goto bad_packet;
1635         if (!pskb_may_pull(skb, th->doff * 4))
1636                 goto discard_it;
1637
1638         /* An explanation is required here, I think.
1639          * Packet length and doff are validated by header prediction,
1640          * provided the case of th->doff==0 is eliminated.
1641          * So, we defer the checks. */
1642         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1643                 goto bad_packet;
1644
1645         th = tcp_hdr(skb);
1646         iph = ip_hdr(skb);
1647         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1648         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1649                                     skb->len - th->doff * 4);
1650         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1651         TCP_SKB_CB(skb)->when    = 0;
1652         TCP_SKB_CB(skb)->flags   = iph->tos;
1653         TCP_SKB_CB(skb)->sacked  = 0;
1654
1655         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1656         if (!sk)
1657                 goto no_tcp_socket;
1658
1659 process:
1660         if (sk->sk_state == TCP_TIME_WAIT)
1661                 goto do_time_wait;
1662
1663         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1664                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1665                 goto discard_and_relse;
1666         }
1667
1668         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1669                 goto discard_and_relse;
1670         nf_reset(skb);
1671
1672         if (sk_filter(sk, skb))
1673                 goto discard_and_relse;
1674
1675         skb->dev = NULL;
1676
1677         bh_lock_sock_nested(sk);
1678         ret = 0;
1679         if (!sock_owned_by_user(sk)) {
1680 #ifdef CONFIG_NET_DMA
1681                 struct tcp_sock *tp = tcp_sk(sk);
1682                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1683                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1684                 if (tp->ucopy.dma_chan)
1685                         ret = tcp_v4_do_rcv(sk, skb);
1686                 else
1687 #endif
1688                 {
1689                         if (!tcp_prequeue(sk, skb))
1690                                 ret = tcp_v4_do_rcv(sk, skb);
1691                 }
1692         } else if (unlikely(sk_add_backlog(sk, skb))) {
1693                 bh_unlock_sock(sk);
1694                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1695                 goto discard_and_relse;
1696         }
1697         bh_unlock_sock(sk);
1698
1699         sock_put(sk);
1700
1701         return ret;
1702
1703 no_tcp_socket:
1704         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1705                 goto discard_it;
1706
1707         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1708 bad_packet:
1709                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1710         } else {
1711                 tcp_v4_send_reset(NULL, skb);
1712         }
1713
1714 discard_it:
1715         /* Discard frame. */
1716         kfree_skb(skb);
1717         return 0;
1718
1719 discard_and_relse:
1720         sock_put(sk);
1721         goto discard_it;
1722
1723 do_time_wait:
1724         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1725                 inet_twsk_put(inet_twsk(sk));
1726                 goto discard_it;
1727         }
1728
1729         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1730                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1731                 inet_twsk_put(inet_twsk(sk));
1732                 goto discard_it;
1733         }
1734         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1735         case TCP_TW_SYN: {
1736                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1737                                                         &tcp_hashinfo,
1738                                                         iph->daddr, th->dest,
1739                                                         inet_iif(skb));
1740                 if (sk2) {
1741                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1742                         inet_twsk_put(inet_twsk(sk));
1743                         sk = sk2;
1744                         goto process;
1745                 }
1746                 /* Fall through to ACK */
1747         }
1748         case TCP_TW_ACK:
1749                 tcp_v4_timewait_ack(sk, skb);
1750                 break;
1751         case TCP_TW_RST:
1752                 goto no_tcp_socket;
1753         case TCP_TW_SUCCESS:;
1754         }
1755         goto discard_it;
1756 }
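
/*
 * A hedged userspace sketch of the knob behind the min_ttl check in
 * tcp_v4_rcv() above: an application (for example a BGP speaker doing
 * RFC 5082 GTSM) sets IP_MINTTL on its socket, and segments arriving
 * with a smaller IP TTL are dropped and accounted as
 * LINUX_MIB_TCPMINTTLDROP. Illustrative only (wrapped in #if 0);
 * assumes IP_MINTTL is exposed by the system headers and omits error
 * handling.
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>

static int require_directly_connected_peer(int fd)
{
	int min_ttl = 255;	/* peer must be one hop away: TTL undecremented */

	return setsockopt(fd, IPPROTO_IP, IP_MINTTL, &min_ttl,
			  sizeof(min_ttl));
}
#endif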
1757
1758 /* VJ's idea. Save the last timestamp seen from this destination
1759  * and hold it at least for the normal timewait interval, to use for
1760  * duplicate segment detection in subsequent connections before they
1761  * enter the synchronized state.
1762  */
1763
1764 int tcp_v4_remember_stamp(struct sock *sk)
1765 {
1766         struct inet_sock *inet = inet_sk(sk);
1767         struct tcp_sock *tp = tcp_sk(sk);
1768         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1769         struct inet_peer *peer = NULL;
1770         int release_it = 0;
1771
1772         if (!rt || rt->rt_dst != inet->inet_daddr) {
1773                 peer = inet_getpeer(inet->inet_daddr, 1);
1774                 release_it = 1;
1775         } else {
1776                 if (!rt->peer)
1777                         rt_bind_peer(rt, 1);
1778                 peer = rt->peer;
1779         }
1780
1781         if (peer) {
1782                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1783                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1784                      peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1785                         peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1786                         peer->tcp_ts = tp->rx_opt.ts_recent;
1787                 }
1788                 if (release_it)
1789                         inet_putpeer(peer);
1790                 return 1;
1791         }
1792
1793         return 0;
1794 }
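
/*
 * Why tcp_v4_remember_stamp() above compares timestamps through a
 * signed 32-bit difference: TCP timestamps are free-running u32
 * counters that wrap, so "a is not newer than b" is written as
 * (s32)(a - b) <= 0, which stays correct across the wrap as long as
 * the two values are less than 2^31 apart. A worked example with
 * made-up values:
 *
 *	u32 a = 0xfffffff0;	(near the top of the range)
 *	u32 b = 0x00000010;	(just after the wrap)
 *
 *	(s32)(a - b) == (s32)0xffffffe0 == -32  =>  a counts as older
 *	(s32)(b - a) == (s32)0x00000020 ==  32  =>  b counts as newer
 */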
1795
1796 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1797 {
1798         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1799
1800         if (peer) {
1801                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1802
1803                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1804                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1805                      peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1806                         peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1807                         peer->tcp_ts       = tcptw->tw_ts_recent;
1808                 }
1809                 inet_putpeer(peer);
1810                 return 1;
1811         }
1812
1813         return 0;
1814 }
1815
1816 const struct inet_connection_sock_af_ops ipv4_specific = {
1817         .queue_xmit        = ip_queue_xmit,
1818         .send_check        = tcp_v4_send_check,
1819         .rebuild_header    = inet_sk_rebuild_header,
1820         .conn_request      = tcp_v4_conn_request,
1821         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1822         .remember_stamp    = tcp_v4_remember_stamp,
1823         .net_header_len    = sizeof(struct iphdr),
1824         .setsockopt        = ip_setsockopt,
1825         .getsockopt        = ip_getsockopt,
1826         .addr2sockaddr     = inet_csk_addr2sockaddr,
1827         .sockaddr_len      = sizeof(struct sockaddr_in),
1828         .bind_conflict     = inet_csk_bind_conflict,
1829 #ifdef CONFIG_COMPAT
1830         .compat_setsockopt = compat_ip_setsockopt,
1831         .compat_getsockopt = compat_ip_getsockopt,
1832 #endif
1833 };
1834
1835 #ifdef CONFIG_TCP_MD5SIG
1836 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1837         .md5_lookup             = tcp_v4_md5_lookup,
1838         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1839         .md5_add                = tcp_v4_md5_add_func,
1840         .md5_parse              = tcp_v4_parse_md5_keys,
1841 };
1842 #endif
1843
1844 /* NOTE: A lot of things are set to zero explicitly by the call to
1845  *       sk_alloc(), so they need not be done here.
1846  */
1847 static int tcp_v4_init_sock(struct sock *sk)
1848 {
1849         struct inet_connection_sock *icsk = inet_csk(sk);
1850         struct tcp_sock *tp = tcp_sk(sk);
1851
1852         skb_queue_head_init(&tp->out_of_order_queue);
1853         tcp_init_xmit_timers(sk);
1854         tcp_prequeue_init(tp);
1855
1856         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1857         tp->mdev = TCP_TIMEOUT_INIT;
1858
1859         /* So many TCP implementations out there (incorrectly) count the
1860          * initial SYN frame in their delayed-ACK and congestion control
1861          * algorithms that we must have the following bandaid to talk
1862          * efficiently to them.  -DaveM
1863          */
1864         tp->snd_cwnd = 2;
1865
1866         /* See draft-stevens-tcpca-spec-01 for discussion of the
1867          * initialization of these values.
1868          */
1869         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1870         tp->snd_cwnd_clamp = ~0;
1871         tp->mss_cache = TCP_MSS_DEFAULT;
1872
1873         tp->reordering = sysctl_tcp_reordering;
1874         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1875
1876         sk->sk_state = TCP_CLOSE;
1877
1878         sk->sk_write_space = sk_stream_write_space;
1879         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1880
1881         icsk->icsk_af_ops = &ipv4_specific;
1882         icsk->icsk_sync_mss = tcp_sync_mss;
1883 #ifdef CONFIG_TCP_MD5SIG
1884         tp->af_specific = &tcp_sock_ipv4_specific;
1885 #endif
1886
1887         /* TCP Cookie Transactions */
1888         if (sysctl_tcp_cookie_size > 0) {
1889                 /* Default, cookies without s_data_payload. */
1890                 tp->cookie_values =
1891                         kzalloc(sizeof(*tp->cookie_values),
1892                                 sk->sk_allocation);
1893                 if (tp->cookie_values != NULL)
1894                         kref_init(&tp->cookie_values->kref);
1895         }
1896         /* Presumed zeroed, in order of appearance:
1897          *      cookie_in_always, cookie_out_never,
1898          *      s_data_constant, s_data_in, s_data_out
1899          */
1900         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1901         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1902
1903         local_bh_disable();
1904         percpu_counter_inc(&tcp_sockets_allocated);
1905         local_bh_enable();
1906
1907         return 0;
1908 }
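
/*
 * For illustration of the initial values above, assuming the fallback
 * tp->mss_cache of TCP_MSS_DEFAULT (536 bytes) is still in effect: an
 * initial congestion window of 2 segments allows at most
 * 2 * 536 = 1072 bytes of payload in flight before the first ACK.
 * Real connections normally negotiate a larger MSS during the
 * handshake, e.g. 1460 bytes on plain Ethernet, giving
 * 2 * 1460 = 2920 bytes.
 */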
1909
1910 void tcp_v4_destroy_sock(struct sock *sk)
1911 {
1912         struct tcp_sock *tp = tcp_sk(sk);
1913
1914         tcp_clear_xmit_timers(sk);
1915
1916         tcp_cleanup_congestion_control(sk);
1917
1918         /* Clean up the write buffer. */
1919         tcp_write_queue_purge(sk);
1920
1921         /* Cleans up our, hopefully empty, out_of_order_queue. */
1922         __skb_queue_purge(&tp->out_of_order_queue);
1923
1924 #ifdef CONFIG_TCP_MD5SIG
1925         /* Clean up the MD5 key list, if any */
1926         if (tp->md5sig_info) {
1927                 tcp_v4_clear_md5_list(sk);
1928                 kfree(tp->md5sig_info);
1929                 tp->md5sig_info = NULL;
1930         }
1931 #endif
1932
1933 #ifdef CONFIG_NET_DMA
1934         /* Cleans up our sk_async_wait_queue */
1935         __skb_queue_purge(&sk->sk_async_wait_queue);
1936 #endif
1937
1938         /* Clean up the prequeue; it really must be empty by now. */
1939         __skb_queue_purge(&tp->ucopy.prequeue);
1940
1941         /* Clean up a referenced TCP bind bucket. */
1942         if (inet_csk(sk)->icsk_bind_hash)
1943                 inet_put_port(sk);
1944
1945         /*
1946          * If a sendmsg cached page exists, toss it.
1947          */
1948         if (sk->sk_sndmsg_page) {
1949                 __free_page(sk->sk_sndmsg_page);
1950                 sk->sk_sndmsg_page = NULL;
1951         }
1952
1953         /* TCP Cookie Transactions */
1954         if (tp->cookie_values != NULL) {
1955                 kref_put(&tp->cookie_values->kref,
1956                          tcp_cookie_values_release);
1957                 tp->cookie_values = NULL;
1958         }
1959
1960         percpu_counter_dec(&tcp_sockets_allocated);
1961 }
1962
1963 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1964
1965 #ifdef CONFIG_PROC_FS
1966 /* Proc filesystem TCP sock list dumping. */
1967
1968 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1969 {
1970         return hlist_nulls_empty(head) ? NULL :
1971                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1972 }
1973
1974 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1975 {
1976         return !is_a_nulls(tw->tw_node.next) ?
1977                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1978 }
1979
1980 static void *listening_get_next(struct seq_file *seq, void *cur)
1981 {
1982         struct inet_connection_sock *icsk;
1983         struct hlist_nulls_node *node;
1984         struct sock *sk = cur;
1985         struct inet_listen_hashbucket *ilb;
1986         struct tcp_iter_state *st = seq->private;
1987         struct net *net = seq_file_net(seq);
1988
1989         if (!sk) {
1990                 st->bucket = 0;
1991                 ilb = &tcp_hashinfo.listening_hash[0];
1992                 spin_lock_bh(&ilb->lock);
1993                 sk = sk_nulls_head(&ilb->head);
1994                 goto get_sk;
1995         }
1996         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1997         ++st->num;
1998
1999         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2000                 struct request_sock *req = cur;
2001
2002                 icsk = inet_csk(st->syn_wait_sk);
2003                 req = req->dl_next;
2004                 while (1) {
2005                         while (req) {
2006                                 if (req->rsk_ops->family == st->family) {
2007                                         cur = req;
2008                                         goto out;
2009                                 }
2010                                 req = req->dl_next;
2011                         }
2012                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2013                                 break;
2014 get_req:
2015                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2016                 }
2017                 sk        = sk_next(st->syn_wait_sk);
2018                 st->state = TCP_SEQ_STATE_LISTENING;
2019                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2020         } else {
2021                 icsk = inet_csk(sk);
2022                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2023                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2024                         goto start_req;
2025                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2026                 sk = sk_next(sk);
2027         }
2028 get_sk:
2029         sk_nulls_for_each_from(sk, node) {
2030                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2031                         cur = sk;
2032                         goto out;
2033                 }
2034                 icsk = inet_csk(sk);
2035                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2036                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2037 start_req:
2038                         st->uid         = sock_i_uid(sk);
2039                         st->syn_wait_sk = sk;
2040                         st->state       = TCP_SEQ_STATE_OPENREQ;
2041                         st->sbucket     = 0;
2042                         goto get_req;
2043                 }
2044                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2045         }
2046         spin_unlock_bh(&ilb->lock);
2047         if (++st->bucket < INET_LHTABLE_SIZE) {
2048                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2049                 spin_lock_bh(&ilb->lock);
2050                 sk = sk_nulls_head(&ilb->head);
2051                 goto get_sk;
2052         }
2053         cur = NULL;
2054 out:
2055         return cur;
2056 }
2057
2058 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2059 {
2060         void *rc = listening_get_next(seq, NULL);
2061
2062         while (rc && *pos) {
2063                 rc = listening_get_next(seq, rc);
2064                 --*pos;
2065         }
2066         return rc;
2067 }
2068
2069 static inline int empty_bucket(struct tcp_iter_state *st)
2070 {
2071         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2072                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2073 }
2074
2075 static void *established_get_first(struct seq_file *seq)
2076 {
2077         struct tcp_iter_state *st = seq->private;
2078         struct net *net = seq_file_net(seq);
2079         void *rc = NULL;
2080
2081         for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2082                 struct sock *sk;
2083                 struct hlist_nulls_node *node;
2084                 struct inet_timewait_sock *tw;
2085                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2086
2087                 /* Lockless fast path for the common case of empty buckets */
2088                 if (empty_bucket(st))
2089                         continue;
2090
2091                 spin_lock_bh(lock);
2092                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2093                         if (sk->sk_family != st->family ||
2094                             !net_eq(sock_net(sk), net)) {
2095                                 continue;
2096                         }
2097                         rc = sk;
2098                         goto out;
2099                 }
2100                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2101                 inet_twsk_for_each(tw, node,
2102                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2103                         if (tw->tw_family != st->family ||
2104                             !net_eq(twsk_net(tw), net)) {
2105                                 continue;
2106                         }
2107                         rc = tw;
2108                         goto out;
2109                 }
2110                 spin_unlock_bh(lock);
2111                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2112         }
2113 out:
2114         return rc;
2115 }
2116
2117 static void *established_get_next(struct seq_file *seq, void *cur)
2118 {
2119         struct sock *sk = cur;
2120         struct inet_timewait_sock *tw;
2121         struct hlist_nulls_node *node;
2122         struct tcp_iter_state *st = seq->private;
2123         struct net *net = seq_file_net(seq);
2124
2125         ++st->num;
2126
2127         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2128                 tw = cur;
2129                 tw = tw_next(tw);
2130 get_tw:
2131                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2132                         tw = tw_next(tw);
2133                 }
2134                 if (tw) {
2135                         cur = tw;
2136                         goto out;
2137                 }
2138                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2139                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2140
2141                 /* Look for the next non-empty bucket. */
2142                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2143                                 empty_bucket(st))
2144                         ;
2145                 if (st->bucket > tcp_hashinfo.ehash_mask)
2146                         return NULL;
2147
2148                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2149                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2150         } else
2151                 sk = sk_nulls_next(sk);
2152
2153         sk_nulls_for_each_from(sk, node) {
2154                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2155                         goto found;
2156         }
2157
2158         st->state = TCP_SEQ_STATE_TIME_WAIT;
2159         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2160         goto get_tw;
2161 found:
2162         cur = sk;
2163 out:
2164         return cur;
2165 }
2166
2167 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2168 {
2169         void *rc = established_get_first(seq);
2170
2171         while (rc && pos) {
2172                 rc = established_get_next(seq, rc);
2173                 --pos;
2174         }
2175         return rc;
2176 }
2177
2178 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2179 {
2180         void *rc;
2181         struct tcp_iter_state *st = seq->private;
2182
2183         st->state = TCP_SEQ_STATE_LISTENING;
2184         rc        = listening_get_idx(seq, &pos);
2185
2186         if (!rc) {
2187                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2188                 rc        = established_get_idx(seq, pos);
2189         }
2190
2191         return rc;
2192 }
2193
2194 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2195 {
2196         struct tcp_iter_state *st = seq->private;
2197         st->state = TCP_SEQ_STATE_LISTENING;
2198         st->num = 0;
2199         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2200 }
2201
2202 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2203 {
2204         void *rc = NULL;
2205         struct tcp_iter_state *st;
2206
2207         if (v == SEQ_START_TOKEN) {
2208                 rc = tcp_get_idx(seq, 0);
2209                 goto out;
2210         }
2211         st = seq->private;
2212
2213         switch (st->state) {
2214         case TCP_SEQ_STATE_OPENREQ:
2215         case TCP_SEQ_STATE_LISTENING:
2216                 rc = listening_get_next(seq, v);
2217                 if (!rc) {
2218                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2219                         rc        = established_get_first(seq);
2220                 }
2221                 break;
2222         case TCP_SEQ_STATE_ESTABLISHED:
2223         case TCP_SEQ_STATE_TIME_WAIT:
2224                 rc = established_get_next(seq, v);
2225                 break;
2226         }
2227 out:
2228         ++*pos;
2229         return rc;
2230 }
2231
2232 static void tcp_seq_stop(struct seq_file *seq, void *v)
2233 {
2234         struct tcp_iter_state *st = seq->private;
2235
2236         switch (st->state) {
2237         case TCP_SEQ_STATE_OPENREQ:
2238                 if (v) {
2239                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2240                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2241                 }
2242         case TCP_SEQ_STATE_LISTENING:
2243                 if (v != SEQ_START_TOKEN)
2244                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2245                 break;
2246         case TCP_SEQ_STATE_TIME_WAIT:
2247         case TCP_SEQ_STATE_ESTABLISHED:
2248                 if (v)
2249                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2250                 break;
2251         }
2252 }
2253
2254 static int tcp_seq_open(struct inode *inode, struct file *file)
2255 {
2256         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2257         struct tcp_iter_state *s;
2258         int err;
2259
2260         err = seq_open_net(inode, file, &afinfo->seq_ops,
2261                           sizeof(struct tcp_iter_state));
2262         if (err < 0)
2263                 return err;
2264
2265         s = ((struct seq_file *)file->private_data)->private;
2266         s->family               = afinfo->family;
2267         return 0;
2268 }
2269
2270 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2271 {
2272         int rc = 0;
2273         struct proc_dir_entry *p;
2274
2275         afinfo->seq_fops.open           = tcp_seq_open;
2276         afinfo->seq_fops.read           = seq_read;
2277         afinfo->seq_fops.llseek         = seq_lseek;
2278         afinfo->seq_fops.release        = seq_release_net;
2279
2280         afinfo->seq_ops.start           = tcp_seq_start;
2281         afinfo->seq_ops.next            = tcp_seq_next;
2282         afinfo->seq_ops.stop            = tcp_seq_stop;
2283
2284         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2285                              &afinfo->seq_fops, afinfo);
2286         if (!p)
2287                 rc = -ENOMEM;
2288         return rc;
2289 }
2290
2291 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2292 {
2293         proc_net_remove(net, afinfo->name);
2294 }
2295
2296 static void get_openreq4(struct sock *sk, struct request_sock *req,
2297                          struct seq_file *f, int i, int uid, int *len)
2298 {
2299         const struct inet_request_sock *ireq = inet_rsk(req);
2300         int ttd = req->expires - jiffies;
2301
2302         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2303                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2304                 i,
2305                 ireq->loc_addr,
2306                 ntohs(inet_sk(sk)->inet_sport),
2307                 ireq->rmt_addr,
2308                 ntohs(ireq->rmt_port),
2309                 TCP_SYN_RECV,
2310                 0, 0, /* could print option size, but that is af dependent. */
2311                 1,    /* timers active (only the expire timer) */
2312                 jiffies_to_clock_t(ttd),
2313                 req->retrans,
2314                 uid,
2315                 0,  /* non-standard timer */
2316                 0, /* open_requests have no inode */
2317                 atomic_read(&sk->sk_refcnt),
2318                 req,
2319                 len);
2320 }
2321
2322 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2323 {
2324         int timer_active;
2325         unsigned long timer_expires;
2326         struct tcp_sock *tp = tcp_sk(sk);
2327         const struct inet_connection_sock *icsk = inet_csk(sk);
2328         struct inet_sock *inet = inet_sk(sk);
2329         __be32 dest = inet->inet_daddr;
2330         __be32 src = inet->inet_rcv_saddr;
2331         __u16 destp = ntohs(inet->inet_dport);
2332         __u16 srcp = ntohs(inet->inet_sport);
2333         int rx_queue;
2334
2335         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2336                 timer_active    = 1;
2337                 timer_expires   = icsk->icsk_timeout;
2338         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2339                 timer_active    = 4;
2340                 timer_expires   = icsk->icsk_timeout;
2341         } else if (timer_pending(&sk->sk_timer)) {
2342                 timer_active    = 2;
2343                 timer_expires   = sk->sk_timer.expires;
2344         } else {
2345                 timer_active    = 0;
2346                 timer_expires = jiffies;
2347         }
2348
2349         if (sk->sk_state == TCP_LISTEN)
2350                 rx_queue = sk->sk_ack_backlog;
2351         else
2352                 /*
2353                  * Because we don't lock the socket, we might find a transient negative value.
2354                  */
2355                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2356
2357         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2358                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2359                 i, src, srcp, dest, destp, sk->sk_state,
2360                 tp->write_seq - tp->snd_una,
2361                 rx_queue,
2362                 timer_active,
2363                 jiffies_to_clock_t(timer_expires - jiffies),
2364                 icsk->icsk_retransmits,
2365                 sock_i_uid(sk),
2366                 icsk->icsk_probes_out,
2367                 sock_i_ino(sk),
2368                 atomic_read(&sk->sk_refcnt), sk,
2369                 jiffies_to_clock_t(icsk->icsk_rto),
2370                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2371                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2372                 tp->snd_cwnd,
2373                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2374                 len);
2375 }
2376
2377 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2378                                struct seq_file *f, int i, int *len)
2379 {
2380         __be32 dest, src;
2381         __u16 destp, srcp;
2382         int ttd = tw->tw_ttd - jiffies;
2383
2384         if (ttd < 0)
2385                 ttd = 0;
2386
2387         dest  = tw->tw_daddr;
2388         src   = tw->tw_rcv_saddr;
2389         destp = ntohs(tw->tw_dport);
2390         srcp  = ntohs(tw->tw_sport);
2391
2392         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2393                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2394                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2395                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2396                 atomic_read(&tw->tw_refcnt), tw, len);
2397 }
2398
2399 #define TMPSZ 150
2400
2401 static int tcp4_seq_show(struct seq_file *seq, void *v)
2402 {
2403         struct tcp_iter_state *st;
2404         int len;
2405
2406         if (v == SEQ_START_TOKEN) {
2407                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2408                            "  sl  local_address rem_address   st tx_queue "
2409                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2410                            "inode");
2411                 goto out;
2412         }
2413         st = seq->private;
2414
2415         switch (st->state) {
2416         case TCP_SEQ_STATE_LISTENING:
2417         case TCP_SEQ_STATE_ESTABLISHED:
2418                 get_tcp4_sock(v, seq, st->num, &len);
2419                 break;
2420         case TCP_SEQ_STATE_OPENREQ:
2421                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2422                 break;
2423         case TCP_SEQ_STATE_TIME_WAIT:
2424                 get_timewait4_sock(v, seq, st->num, &len);
2425                 break;
2426         }
2427         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2428 out:
2429         return 0;
2430 }
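
/*
 * What the seq_printf() calls above emit: one fixed-width line per
 * socket in /proc/net/tcp, with addresses and ports in hexadecimal.
 * A hypothetical line for a listener on 127.0.0.1:8080, as written by
 * a little-endian host, would begin
 *
 *	   0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * where 0100007F is 127.0.0.1 (the __be32 address printed as a raw
 * integer), 1F90 is port 8080 and 0A is TCP_LISTEN. Below is a minimal
 * userspace decoder for the address:port fields, illustrative only
 * (wrapped in #if 0) and assuming the reader runs with the same
 * endianness as the host that wrote the file.
 */
#if 0
#include <arpa/inet.h>
#include <stdio.h>

static void print_proc_net_tcp_addr(const char *hex_addr,
				    const char *hex_port)
{
	struct in_addr a;
	unsigned int addr, port;

	sscanf(hex_addr, "%x", &addr);	/* raw __be32 as printed by %08X       */
	sscanf(hex_port, "%x", &port);	/* already host order (%04X of ntohs)  */
	a.s_addr = addr;
	printf("%s:%u\n", inet_ntoa(a), port);
}
#endif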
2431
2432 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2433         .name           = "tcp",
2434         .family         = AF_INET,
2435         .seq_fops       = {
2436                 .owner          = THIS_MODULE,
2437         },
2438         .seq_ops        = {
2439                 .show           = tcp4_seq_show,
2440         },
2441 };
2442
2443 static int __net_init tcp4_proc_init_net(struct net *net)
2444 {
2445         return tcp_proc_register(net, &tcp4_seq_afinfo);
2446 }
2447
2448 static void __net_exit tcp4_proc_exit_net(struct net *net)
2449 {
2450         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2451 }
2452
2453 static struct pernet_operations tcp4_net_ops = {
2454         .init = tcp4_proc_init_net,
2455         .exit = tcp4_proc_exit_net,
2456 };
2457
2458 int __init tcp4_proc_init(void)
2459 {
2460         return register_pernet_subsys(&tcp4_net_ops);
2461 }
2462
2463 void tcp4_proc_exit(void)
2464 {
2465         unregister_pernet_subsys(&tcp4_net_ops);
2466 }
2467 #endif /* CONFIG_PROC_FS */
2468
2469 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2470 {
2471         struct iphdr *iph = skb_gro_network_header(skb);
2472
2473         switch (skb->ip_summed) {
2474         case CHECKSUM_COMPLETE:
2475                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2476                                   skb->csum)) {
2477                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2478                         break;
2479                 }
2480
2481                 /* fall through */
2482         case CHECKSUM_NONE:
2483                 NAPI_GRO_CB(skb)->flush = 1;
2484                 return NULL;
2485         }
2486
2487         return tcp_gro_receive(head, skb);
2488 }
2489 EXPORT_SYMBOL(tcp4_gro_receive);
2490
2491 int tcp4_gro_complete(struct sk_buff *skb)
2492 {
2493         struct iphdr *iph = ip_hdr(skb);
2494         struct tcphdr *th = tcp_hdr(skb);
2495
2496         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2497                                   iph->saddr, iph->daddr, 0);
2498         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2499
2500         return tcp_gro_complete(skb);
2501 }
2502 EXPORT_SYMBOL(tcp4_gro_complete);
2503
2504 struct proto tcp_prot = {
2505         .name                   = "TCP",
2506         .owner                  = THIS_MODULE,
2507         .close                  = tcp_close,
2508         .connect                = tcp_v4_connect,
2509         .disconnect             = tcp_disconnect,
2510         .accept                 = inet_csk_accept,
2511         .ioctl                  = tcp_ioctl,
2512         .init                   = tcp_v4_init_sock,
2513         .destroy                = tcp_v4_destroy_sock,
2514         .shutdown               = tcp_shutdown,
2515         .setsockopt             = tcp_setsockopt,
2516         .getsockopt             = tcp_getsockopt,
2517         .recvmsg                = tcp_recvmsg,
2518         .backlog_rcv            = tcp_v4_do_rcv,
2519         .hash                   = inet_hash,
2520         .unhash                 = inet_unhash,
2521         .get_port               = inet_csk_get_port,
2522         .enter_memory_pressure  = tcp_enter_memory_pressure,
2523         .sockets_allocated      = &tcp_sockets_allocated,
2524         .orphan_count           = &tcp_orphan_count,
2525         .memory_allocated       = &tcp_memory_allocated,
2526         .memory_pressure        = &tcp_memory_pressure,
2527         .sysctl_mem             = sysctl_tcp_mem,
2528         .sysctl_wmem            = sysctl_tcp_wmem,
2529         .sysctl_rmem            = sysctl_tcp_rmem,
2530         .max_header             = MAX_TCP_HEADER,
2531         .obj_size               = sizeof(struct tcp_sock),
2532         .slab_flags             = SLAB_DESTROY_BY_RCU,
2533         .twsk_prot              = &tcp_timewait_sock_ops,
2534         .rsk_prot               = &tcp_request_sock_ops,
2535         .h.hashinfo             = &tcp_hashinfo,
2536 #ifdef CONFIG_COMPAT
2537         .compat_setsockopt      = compat_tcp_setsockopt,
2538         .compat_getsockopt      = compat_tcp_getsockopt,
2539 #endif
2540 };
2541
2542
2543 static int __net_init tcp_sk_init(struct net *net)
2544 {
2545         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2546                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2547 }
2548
2549 static void __net_exit tcp_sk_exit(struct net *net)
2550 {
2551         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2552 }
2553
2554 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2555 {
2556         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2557 }
2558
2559 static struct pernet_operations __net_initdata tcp_sk_ops = {
2560        .init       = tcp_sk_init,
2561        .exit       = tcp_sk_exit,
2562        .exit_batch = tcp_sk_exit_batch,
2563 };
2564
2565 void __init tcp_v4_init(void)
2566 {
2567         inet_hashinfo_init(&tcp_hashinfo);
2568         if (register_pernet_subsys(&tcp_sk_ops))
2569                 panic("Failed to create the TCP control socket.\n");
2570 }
2571
2572 EXPORT_SYMBOL(ipv4_specific);
2573 EXPORT_SYMBOL(tcp_hashinfo);
2574 EXPORT_SYMBOL(tcp_prot);
2575 EXPORT_SYMBOL(tcp_v4_conn_request);
2576 EXPORT_SYMBOL(tcp_v4_connect);
2577 EXPORT_SYMBOL(tcp_v4_do_rcv);
2578 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2579 EXPORT_SYMBOL(tcp_v4_send_check);
2580 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2581
2582 #ifdef CONFIG_PROC_FS
2583 EXPORT_SYMBOL(tcp_proc_register);
2584 EXPORT_SYMBOL(tcp_proc_unregister);
2585 #endif
2586 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2587