tcp: Fix tcp_v4_rcv()
net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86
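/*
 * The two knobs above are exposed through procfs as
 * /proc/sys/net/ipv4/tcp_tw_reuse and /proc/sys/net/ipv4/tcp_low_latency.
 * Below is a small userspace sketch of flipping tcp_tw_reuse; it is
 * illustrative only and kept compiled out (it is not part of the kernel
 * build, and the helper name is invented for the example).
 */
#if 0
#include <stdio.h>

/* Allow reuse of TIME-WAIT sockets for new outgoing connections. */
static int enable_tcp_tw_reuse(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

	if (!f)
		return -1;
	fputs("1\n", f);	/* same effect as: sysctl -w net.ipv4.tcp_tw_reuse=1 */
	fclose(f);
	return 0;
}
#endif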
87
88 #ifdef CONFIG_TCP_MD5SIG
89 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
90                                                    __be32 addr);
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, struct tcphdr *th);
93 #else
94 static inline
95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96 {
97         return NULL;
98 }
99 #endif
100
101 struct inet_hashinfo tcp_hashinfo;
102
103 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104 {
105         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106                                           ip_hdr(skb)->saddr,
107                                           tcp_hdr(skb)->dest,
108                                           tcp_hdr(skb)->source);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         /* With PAWS, it is safe from the viewpoint
117            of data integrity. Even without PAWS it is safe provided sequence
118            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
119
120            Actually, the idea is close to VJ's: the timestamp cache is held
121            not per host but per port pair, and the TW bucket is used as the
122            state holder.
123
124            If the TW bucket has already been destroyed, we fall back to VJ's
125            scheme and use the initial timestamp retrieved from the peer table.
126          */
127         if (tcptw->tw_ts_recent_stamp &&
128             (twp == NULL || (sysctl_tcp_tw_reuse &&
129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131                 if (tp->write_seq == 0)
132                         tp->write_seq = 1;
133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135                 sock_hold(sktw);
136                 return 1;
137         }
138
139         return 0;
140 }
141
142 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
143
144 /* This will initiate an outgoing connection. */
145 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
146 {
147         struct inet_sock *inet = inet_sk(sk);
148         struct tcp_sock *tp = tcp_sk(sk);
149         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150         struct rtable *rt;
151         __be32 daddr, nexthop;
152         int tmp;
153         int err;
154
155         if (addr_len < sizeof(struct sockaddr_in))
156                 return -EINVAL;
157
158         if (usin->sin_family != AF_INET)
159                 return -EAFNOSUPPORT;
160
161         nexthop = daddr = usin->sin_addr.s_addr;
162         if (inet->opt && inet->opt->srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet->opt->faddr;
166         }
167
168         tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
169                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170                                IPPROTO_TCP,
171                                inet->inet_sport, usin->sin_port, sk, 1);
172         if (tmp < 0) {
173                 if (tmp == -ENETUNREACH)
174                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
175                 return tmp;
176         }
177
178         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
179                 ip_rt_put(rt);
180                 return -ENETUNREACH;
181         }
182
183         if (!inet->opt || !inet->opt->srr)
184                 daddr = rt->rt_dst;
185
186         if (!inet->inet_saddr)
187                 inet->inet_saddr = rt->rt_src;
188         inet->inet_rcv_saddr = inet->inet_saddr;
189
190         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
191                 /* Reset inherited state */
192                 tp->rx_opt.ts_recent       = 0;
193                 tp->rx_opt.ts_recent_stamp = 0;
194                 tp->write_seq              = 0;
195         }
196
197         if (tcp_death_row.sysctl_tw_recycle &&
198             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
199                 struct inet_peer *peer = rt_get_peer(rt);
200                 /*
201                  * VJ's idea. We save the last timestamp seen from
202                  * the destination in the peer table when entering
203                  * TIME-WAIT state, and initialize rx_opt.ts_recent from it
204                  * when trying a new connection.
205                  */
206                 if (peer != NULL &&
207                     (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
208                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
209                         tp->rx_opt.ts_recent = peer->tcp_ts;
210                 }
211         }
212
213         inet->inet_dport = usin->sin_port;
214         inet->inet_daddr = daddr;
215
216         inet_csk(sk)->icsk_ext_hdr_len = 0;
217         if (inet->opt)
218                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219
220         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
221
222         /* Socket identity is still unknown (sport may be zero).
223          * However we set the state to SYN-SENT and, without releasing the
224          * socket lock, select a source port, enter ourselves into the hash
225          * tables and complete initialization after this.
226          */
227         tcp_set_state(sk, TCP_SYN_SENT);
228         err = inet_hash_connect(&tcp_death_row, sk);
229         if (err)
230                 goto failure;
231
232         err = ip_route_newports(&rt, IPPROTO_TCP,
233                                 inet->inet_sport, inet->inet_dport, sk);
234         if (err)
235                 goto failure;
236
237         /* OK, now commit destination to socket.  */
238         sk->sk_gso_type = SKB_GSO_TCPV4;
239         sk_setup_caps(sk, &rt->u.dst);
240
241         if (!tp->write_seq)
242                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
243                                                            inet->inet_daddr,
244                                                            inet->inet_sport,
245                                                            usin->sin_port);
246
247         inet->inet_id = tp->write_seq ^ jiffies;
248
249         err = tcp_connect(sk);
250         rt = NULL;
251         if (err)
252                 goto failure;
253
254         return 0;
255
256 failure:
257         /*
258          * This unhashes the socket and releases the local port,
259          * if necessary.
260          */
261         tcp_set_state(sk, TCP_CLOSE);
262         ip_rt_put(rt);
263         sk->sk_route_caps = 0;
264         inet->inet_dport = 0;
265         return err;
266 }
267
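/*
 * For context: tcp_v4_connect() above is the kernel side of an ordinary
 * connect(2) on an AF_INET stream socket. A minimal userspace sketch of
 * that call follows; it is illustrative only and kept compiled out, and
 * the address and port are just example values (TEST-NET address).
 */
#if 0
#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int tcp_connect_example(void)
{
	struct sockaddr_in sin;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;	/* anything else returns -EAFNOSUPPORT above */
	sin.sin_port = htons(80);	/* example port */
	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);	/* example address */

	/* Ends up in tcp_v4_connect(): route lookup, SYN-SENT, tcp_connect(). */
	if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
#endif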
268 /*
269  * This routine does path mtu discovery as defined in RFC1191.
270  */
271 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275
276         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
277          * sent out by Linux are always < 576 bytes so they should go through
278          * unfragmented).
279          */
280         if (sk->sk_state == TCP_LISTEN)
281                 return;
282
283         /* We don't check in the dst entry if pmtu discovery is forbidden
284          * on this route. We just assume that no packet-too-big packets
285          * are sent back when pmtu discovery is not active.
286          * There is a small race when the user changes this flag in the
287          * route, but I think that's acceptable.
288          */
289         if ((dst = __sk_dst_check(sk, 0)) == NULL)
290                 return;
291
292         dst->ops->update_pmtu(dst, mtu);
293
294         /* Something is about to go wrong... Remember the soft error
295          * in case this connection is not able to recover.
296          */
297         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
298                 sk->sk_err_soft = EMSGSIZE;
299
300         mtu = dst_mtu(dst);
301
302         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
303             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304                 tcp_sync_mss(sk, mtu);
305
306                 /* Resend the TCP packet because it's
307                  * clear that the old packet has been
308                  * dropped. This is the new "fast" path mtu
309                  * discovery.
310                  */
311                 tcp_simple_retransmit(sk);
312         } /* else let the usual retransmit timer handle it */
313 }
314
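/*
 * do_pmtu_discovery() above reacts to ICMP fragmentation-needed errors.
 * From userspace the per-socket PMTU policy can be changed with the
 * IP_MTU_DISCOVER socket option, and the cached path MTU of a connected
 * socket read back with IP_MTU. A sketch follows; it is illustrative
 * only and kept compiled out, and the helper name is invented for the
 * example.
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>

/* Disable PMTU discovery on @fd and return the currently cached path MTU. */
static int pmtu_example(int fd)
{
	int pmtudisc = IP_PMTUDISC_DONT;	/* don't set DF; allow fragmentation */
	int mtu = 0;
	socklen_t len = sizeof(mtu);

	if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
		       &pmtudisc, sizeof(pmtudisc)) < 0)
		return -1;

	/* IP_MTU is only meaningful on a connected socket. */
	if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) < 0)
		return -1;

	return mtu;
}
#endif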
315 /*
316  * This routine is called by the ICMP module when it gets some
317  * sort of error condition.  If err < 0 then the socket should
318  * be closed and the error returned to the user.  If err > 0
319  * it's just the icmp type << 8 | icmp code.  After adjustment
320  * header points to the first 8 bytes of the tcp header.  We need
321  * to find the appropriate port.
322  *
323  * The locking strategy used here is very "optimistic". When
324  * someone else accesses the socket the ICMP is just dropped
325  * and for some paths there is no check at all.
326  * A more general error queue to queue errors for later handling
327  * is probably better.
328  *
329  */
330
331 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
332 {
333         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
334         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
335         struct inet_connection_sock *icsk;
336         struct tcp_sock *tp;
337         struct inet_sock *inet;
338         const int type = icmp_hdr(icmp_skb)->type;
339         const int code = icmp_hdr(icmp_skb)->code;
340         struct sock *sk;
341         struct sk_buff *skb;
342         __u32 seq;
343         __u32 remaining;
344         int err;
345         struct net *net = dev_net(icmp_skb->dev);
346
347         if (icmp_skb->len < (iph->ihl << 2) + 8) {
348                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349                 return;
350         }
351
352         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353                         iph->saddr, th->source, inet_iif(icmp_skb));
354         if (!sk) {
355                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356                 return;
357         }
358         if (sk->sk_state == TCP_TIME_WAIT) {
359                 inet_twsk_put(inet_twsk(sk));
360                 return;
361         }
362
363         bh_lock_sock(sk);
364         /* If too many ICMPs get dropped on busy
365          * servers this needs to be solved differently.
366          */
367         if (sock_owned_by_user(sk))
368                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
370         if (sk->sk_state == TCP_CLOSE)
371                 goto out;
372
373         icsk = inet_csk(sk);
374         tp = tcp_sk(sk);
375         seq = ntohl(th->seq);
376         if (sk->sk_state != TCP_LISTEN &&
377             !between(seq, tp->snd_una, tp->snd_nxt)) {
378                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
379                 goto out;
380         }
381
382         switch (type) {
383         case ICMP_SOURCE_QUENCH:
384                 /* Just silently ignore these. */
385                 goto out;
386         case ICMP_PARAMETERPROB:
387                 err = EPROTO;
388                 break;
389         case ICMP_DEST_UNREACH:
390                 if (code > NR_ICMP_UNREACH)
391                         goto out;
392
393                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
394                         if (!sock_owned_by_user(sk))
395                                 do_pmtu_discovery(sk, iph, info);
396                         goto out;
397                 }
398
399                 err = icmp_err_convert[code].errno;
400                 /* check if icmp_skb allows revert of backoff
401                  * (see draft-zimmermann-tcp-lcd) */
402                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
403                         break;
404                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
405                     !icsk->icsk_backoff)
406                         break;
407
408                 icsk->icsk_backoff--;
409                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
410                                          icsk->icsk_backoff;
411                 tcp_bound_rto(sk);
412
413                 skb = tcp_write_queue_head(sk);
414                 BUG_ON(!skb);
415
416                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
417                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
418
419                 if (remaining) {
420                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
421                                                   remaining, TCP_RTO_MAX);
422                 } else if (sock_owned_by_user(sk)) {
423                         /* RTO revert clocked out retransmission,
424                          * but socket is locked. Will defer. */
425                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
426                                                   HZ/20, TCP_RTO_MAX);
427                 } else {
428                         /* RTO revert clocked out retransmission.
429                          * Will retransmit now */
430                         tcp_retransmit_timer(sk);
431                 }
432
433                 break;
434         case ICMP_TIME_EXCEEDED:
435                 err = EHOSTUNREACH;
436                 break;
437         default:
438                 goto out;
439         }
440
441         switch (sk->sk_state) {
442                 struct request_sock *req, **prev;
443         case TCP_LISTEN:
444                 if (sock_owned_by_user(sk))
445                         goto out;
446
447                 req = inet_csk_search_req(sk, &prev, th->dest,
448                                           iph->daddr, iph->saddr);
449                 if (!req)
450                         goto out;
451
452                 /* ICMPs are not backlogged, hence we cannot get
453                    an established socket here.
454                  */
455                 WARN_ON(req->sk);
456
457                 if (seq != tcp_rsk(req)->snt_isn) {
458                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
459                         goto out;
460                 }
461
462                 /*
463                  * Still in SYN_RECV, just remove it silently.
464                  * There is no good way to pass the error to the newly
465                  * created socket, and POSIX does not want network
466                  * errors returned from accept().
467                  */
468                 inet_csk_reqsk_queue_drop(sk, req, prev);
469                 goto out;
470
471         case TCP_SYN_SENT:
472         case TCP_SYN_RECV:  /* Cannot happen.
473                                It can, for example, if SYNs crossed.
474                              */
475                 if (!sock_owned_by_user(sk)) {
476                         sk->sk_err = err;
477
478                         sk->sk_error_report(sk);
479
480                         tcp_done(sk);
481                 } else {
482                         sk->sk_err_soft = err;
483                 }
484                 goto out;
485         }
486
487         /* If we've already connected we will keep trying
488          * until we time out, or the user gives up.
489          *
490          * rfc1122 4.2.3.9 allows us to consider as hard errors
491          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
492          * but it is obsoleted by pmtu discovery).
493          *
494          * Note that in the modern internet, where routing is unreliable
495          * and broken firewalls sit in every dark corner sending random
496          * errors ordered by their masters, even these two messages finally
497          * lose their original sense (even Linux sends invalid PORT_UNREACHs)
498          *
499          * Now we are in compliance with RFCs.
500          *                                                      --ANK (980905)
501          */
502
503         inet = inet_sk(sk);
504         if (!sock_owned_by_user(sk) && inet->recverr) {
505                 sk->sk_err = err;
506                 sk->sk_error_report(sk);
507         } else  { /* Only an error on timeout */
508                 sk->sk_err_soft = err;
509         }
510
511 out:
512         bh_unlock_sock(sk);
513         sock_put(sk);
514 }
515
516 /* This routine computes an IPv4 TCP checksum. */
517 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
518 {
519         struct inet_sock *inet = inet_sk(sk);
520         struct tcphdr *th = tcp_hdr(skb);
521
522         if (skb->ip_summed == CHECKSUM_PARTIAL) {
523                 th->check = ~tcp_v4_check(len, inet->inet_saddr,
524                                           inet->inet_daddr, 0);
525                 skb->csum_start = skb_transport_header(skb) - skb->head;
526                 skb->csum_offset = offsetof(struct tcphdr, check);
527         } else {
528                 th->check = tcp_v4_check(len, inet->inet_saddr,
529                                          inet->inet_daddr,
530                                          csum_partial(th,
531                                                       th->doff << 2,
532                                                       skb->csum));
533         }
534 }
535
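/*
 * tcp_v4_check() above folds the IPv4 pseudo-header (source address,
 * destination address, zero, protocol, TCP length) into the standard
 * 16-bit one's-complement Internet checksum. The sketch below spells out
 * that arithmetic in plain C; it is illustrative only and compiled out,
 * since the kernel uses the optimised csum_* helpers instead. It assumes
 * the addresses are passed in host byte order, the checksum field in the
 * segment is already zeroed, and the returned value would be stored with
 * htons().
 */
#if 0
#include <stdint.h>
#include <stddef.h>

static uint16_t tcp_checksum_example(uint32_t saddr, uint32_t daddr,
				     const uint8_t *segment, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	/* Pseudo-header: saddr, daddr, zero, IPPROTO_TCP, TCP length. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;		/* IPPROTO_TCP */
	sum += len;		/* fits: a TCP segment is at most 64KB */

	/* TCP header + payload, taken 16 bits at a time. */
	for (i = 0; i + 1 < len; i += 2)
		sum += (segment[i] << 8) | segment[i + 1];
	if (len & 1)
		sum += segment[len - 1] << 8;	/* pad the odd trailing byte */

	/* Fold the carries and take the one's complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;
}
#endif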
536 int tcp_v4_gso_send_check(struct sk_buff *skb)
537 {
538         const struct iphdr *iph;
539         struct tcphdr *th;
540
541         if (!pskb_may_pull(skb, sizeof(*th)))
542                 return -EINVAL;
543
544         iph = ip_hdr(skb);
545         th = tcp_hdr(skb);
546
547         th->check = 0;
548         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
549         skb->csum_start = skb_transport_header(skb) - skb->head;
550         skb->csum_offset = offsetof(struct tcphdr, check);
551         skb->ip_summed = CHECKSUM_PARTIAL;
552         return 0;
553 }
554
555 /*
556  *      This routine will send an RST to the other tcp.
557  *
558  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
559  *                    for the reset?
560  *      Answer: if a packet caused the RST, it is not for a socket
561  *              existing in our system; if it is matched to a socket,
562  *              it is just a duplicate segment or a bug in the other side's TCP.
563  *              So we build the reply based only on the parameters that
564  *              arrived with the segment.
565  *      Exception: precedence violation. We do not implement it in any case.
566  */
567
568 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
569 {
570         struct tcphdr *th = tcp_hdr(skb);
571         struct {
572                 struct tcphdr th;
573 #ifdef CONFIG_TCP_MD5SIG
574                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
575 #endif
576         } rep;
577         struct ip_reply_arg arg;
578 #ifdef CONFIG_TCP_MD5SIG
579         struct tcp_md5sig_key *key;
580 #endif
581         struct net *net;
582
583         /* Never send a reset in response to a reset. */
584         if (th->rst)
585                 return;
586
587         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
588                 return;
589
590         /* Swap the send and the receive. */
591         memset(&rep, 0, sizeof(rep));
592         rep.th.dest   = th->source;
593         rep.th.source = th->dest;
594         rep.th.doff   = sizeof(struct tcphdr) / 4;
595         rep.th.rst    = 1;
596
597         if (th->ack) {
598                 rep.th.seq = th->ack_seq;
599         } else {
600                 rep.th.ack = 1;
601                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
602                                        skb->len - (th->doff << 2));
603         }
604
605         memset(&arg, 0, sizeof(arg));
606         arg.iov[0].iov_base = (unsigned char *)&rep;
607         arg.iov[0].iov_len  = sizeof(rep.th);
608
609 #ifdef CONFIG_TCP_MD5SIG
610         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
611         if (key) {
612                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
613                                    (TCPOPT_NOP << 16) |
614                                    (TCPOPT_MD5SIG << 8) |
615                                    TCPOLEN_MD5SIG);
616                 /* Update length and the length the header thinks exists */
617                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
618                 rep.th.doff = arg.iov[0].iov_len / 4;
619
620                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
621                                      key, ip_hdr(skb)->saddr,
622                                      ip_hdr(skb)->daddr, &rep.th);
623         }
624 #endif
625         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
626                                       ip_hdr(skb)->saddr, /* XXX */
627                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
628         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
629         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
630
631         net = dev_net(skb_dst(skb)->dev);
632         ip_send_reply(net->ipv4.tcp_sock, skb,
633                       &arg, arg.iov[0].iov_len);
634
635         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
636         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
637 }
638
639 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
640    outside of socket context, is certainly ugly. What can I do?
641  */
642
643 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
644                             u32 win, u32 ts, int oif,
645                             struct tcp_md5sig_key *key,
646                             int reply_flags)
647 {
648         struct tcphdr *th = tcp_hdr(skb);
649         struct {
650                 struct tcphdr th;
651                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
652 #ifdef CONFIG_TCP_MD5SIG
653                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
654 #endif
655                         ];
656         } rep;
657         struct ip_reply_arg arg;
658         struct net *net = dev_net(skb_dst(skb)->dev);
659
660         memset(&rep.th, 0, sizeof(struct tcphdr));
661         memset(&arg, 0, sizeof(arg));
662
663         arg.iov[0].iov_base = (unsigned char *)&rep;
664         arg.iov[0].iov_len  = sizeof(rep.th);
665         if (ts) {
666                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
667                                    (TCPOPT_TIMESTAMP << 8) |
668                                    TCPOLEN_TIMESTAMP);
669                 rep.opt[1] = htonl(tcp_time_stamp);
670                 rep.opt[2] = htonl(ts);
671                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
672         }
673
674         /* Swap the send and the receive. */
675         rep.th.dest    = th->source;
676         rep.th.source  = th->dest;
677         rep.th.doff    = arg.iov[0].iov_len / 4;
678         rep.th.seq     = htonl(seq);
679         rep.th.ack_seq = htonl(ack);
680         rep.th.ack     = 1;
681         rep.th.window  = htons(win);
682
683 #ifdef CONFIG_TCP_MD5SIG
684         if (key) {
685                 int offset = (ts) ? 3 : 0;
686
687                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
688                                           (TCPOPT_NOP << 16) |
689                                           (TCPOPT_MD5SIG << 8) |
690                                           TCPOLEN_MD5SIG);
691                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
692                 rep.th.doff = arg.iov[0].iov_len/4;
693
694                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
695                                     key, ip_hdr(skb)->saddr,
696                                     ip_hdr(skb)->daddr, &rep.th);
697         }
698 #endif
699         arg.flags = reply_flags;
700         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
701                                       ip_hdr(skb)->saddr, /* XXX */
702                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
703         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
704         if (oif)
705                 arg.bound_dev_if = oif;
706
707         ip_send_reply(net->ipv4.tcp_sock, skb,
708                       &arg, arg.iov[0].iov_len);
709
710         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
711 }
712
713 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
714 {
715         struct inet_timewait_sock *tw = inet_twsk(sk);
716         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
717
718         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
719                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
720                         tcptw->tw_ts_recent,
721                         tw->tw_bound_dev_if,
722                         tcp_twsk_md5_key(tcptw),
723                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
724                         );
725
726         inet_twsk_put(tw);
727 }
728
729 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
730                                   struct request_sock *req)
731 {
732         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
733                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
734                         req->ts_recent,
735                         0,
736                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
737                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
738 }
739
740 /*
741  *      Send a SYN-ACK after having received a SYN.
742  *      This still operates on a request_sock only, not on a big
743  *      socket.
744  */
745 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
746                               struct request_sock *req,
747                               struct request_values *rvp)
748 {
749         const struct inet_request_sock *ireq = inet_rsk(req);
750         int err = -1;
751         struct sk_buff * skb;
752
753         /* First, grab a route. */
754         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
755                 return -1;
756
757         skb = tcp_make_synack(sk, dst, req, rvp);
758
759         if (skb) {
760                 struct tcphdr *th = tcp_hdr(skb);
761
762                 th->check = tcp_v4_check(skb->len,
763                                          ireq->loc_addr,
764                                          ireq->rmt_addr,
765                                          csum_partial(th, skb->len,
766                                                       skb->csum));
767
768                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
769                                             ireq->rmt_addr,
770                                             ireq->opt);
771                 err = net_xmit_eval(err);
772         }
773
774         dst_release(dst);
775         return err;
776 }
777
778 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
779                               struct request_values *rvp)
780 {
781         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
782         return tcp_v4_send_synack(sk, NULL, req, rvp);
783 }
784
785 /*
786  *      IPv4 request_sock destructor.
787  */
788 static void tcp_v4_reqsk_destructor(struct request_sock *req)
789 {
790         kfree(inet_rsk(req)->opt);
791 }
792
793 #ifdef CONFIG_SYN_COOKIES
794 static void syn_flood_warning(struct sk_buff *skb)
795 {
796         static unsigned long warntime;
797
798         if (time_after(jiffies, (warntime + HZ * 60))) {
799                 warntime = jiffies;
800                 printk(KERN_INFO
801                        "possible SYN flooding on port %d. Sending cookies.\n",
802                        ntohs(tcp_hdr(skb)->dest));
803         }
804 }
805 #endif
806
807 /*
808  * Save and compile IPv4 options into the request_sock if needed.
809  */
810 static struct ip_options *tcp_v4_save_options(struct sock *sk,
811                                               struct sk_buff *skb)
812 {
813         struct ip_options *opt = &(IPCB(skb)->opt);
814         struct ip_options *dopt = NULL;
815
816         if (opt && opt->optlen) {
817                 int opt_size = optlength(opt);
818                 dopt = kmalloc(opt_size, GFP_ATOMIC);
819                 if (dopt) {
820                         if (ip_options_echo(dopt, skb)) {
821                                 kfree(dopt);
822                                 dopt = NULL;
823                         }
824                 }
825         }
826         return dopt;
827 }
828
829 #ifdef CONFIG_TCP_MD5SIG
830 /*
831  * RFC2385 MD5 checksumming requires a mapping of
832  * IP address->MD5 Key.
833  * We need to maintain these in the sk structure.
834  */
835
836 /* Find the Key structure for an address.  */
837 static struct tcp_md5sig_key *
838                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
839 {
840         struct tcp_sock *tp = tcp_sk(sk);
841         int i;
842
843         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
844                 return NULL;
845         for (i = 0; i < tp->md5sig_info->entries4; i++) {
846                 if (tp->md5sig_info->keys4[i].addr == addr)
847                         return &tp->md5sig_info->keys4[i].base;
848         }
849         return NULL;
850 }
851
852 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
853                                          struct sock *addr_sk)
854 {
855         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
856 }
857
858 EXPORT_SYMBOL(tcp_v4_md5_lookup);
859
860 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
861                                                       struct request_sock *req)
862 {
863         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
864 }
865
866 /* This can be called on a newly created socket, from other files */
867 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
868                       u8 *newkey, u8 newkeylen)
869 {
870         /* Add Key to the list */
871         struct tcp_md5sig_key *key;
872         struct tcp_sock *tp = tcp_sk(sk);
873         struct tcp4_md5sig_key *keys;
874
875         key = tcp_v4_md5_do_lookup(sk, addr);
876         if (key) {
877                 /* Pre-existing entry - just update that one. */
878                 kfree(key->key);
879                 key->key = newkey;
880                 key->keylen = newkeylen;
881         } else {
882                 struct tcp_md5sig_info *md5sig;
883
884                 if (!tp->md5sig_info) {
885                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
886                                                   GFP_ATOMIC);
887                         if (!tp->md5sig_info) {
888                                 kfree(newkey);
889                                 return -ENOMEM;
890                         }
891                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
892                 }
893                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
894                         kfree(newkey);
895                         return -ENOMEM;
896                 }
897                 md5sig = tp->md5sig_info;
898
899                 if (md5sig->alloced4 == md5sig->entries4) {
900                         keys = kmalloc((sizeof(*keys) *
901                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
902                         if (!keys) {
903                                 kfree(newkey);
904                                 tcp_free_md5sig_pool();
905                                 return -ENOMEM;
906                         }
907
908                         if (md5sig->entries4)
909                                 memcpy(keys, md5sig->keys4,
910                                        sizeof(*keys) * md5sig->entries4);
911
912                         /* Free old key list, and reference new one */
913                         kfree(md5sig->keys4);
914                         md5sig->keys4 = keys;
915                         md5sig->alloced4++;
916                 }
917                 md5sig->entries4++;
918                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
919                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
920                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
921         }
922         return 0;
923 }
924
925 EXPORT_SYMBOL(tcp_v4_md5_do_add);
926
927 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
928                                u8 *newkey, u8 newkeylen)
929 {
930         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
931                                  newkey, newkeylen);
932 }
933
934 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
935 {
936         struct tcp_sock *tp = tcp_sk(sk);
937         int i;
938
939         for (i = 0; i < tp->md5sig_info->entries4; i++) {
940                 if (tp->md5sig_info->keys4[i].addr == addr) {
941                         /* Free the key */
942                         kfree(tp->md5sig_info->keys4[i].base.key);
943                         tp->md5sig_info->entries4--;
944
945                         if (tp->md5sig_info->entries4 == 0) {
946                                 kfree(tp->md5sig_info->keys4);
947                                 tp->md5sig_info->keys4 = NULL;
948                                 tp->md5sig_info->alloced4 = 0;
949                         } else if (tp->md5sig_info->entries4 != i) {
950                                 /* Need to do some manipulation */
951                                 memmove(&tp->md5sig_info->keys4[i],
952                                         &tp->md5sig_info->keys4[i+1],
953                                         (tp->md5sig_info->entries4 - i) *
954                                          sizeof(struct tcp4_md5sig_key));
955                         }
956                         tcp_free_md5sig_pool();
957                         return 0;
958                 }
959         }
960         return -ENOENT;
961 }
962
963 EXPORT_SYMBOL(tcp_v4_md5_do_del);
964
965 static void tcp_v4_clear_md5_list(struct sock *sk)
966 {
967         struct tcp_sock *tp = tcp_sk(sk);
968
969         /* Free each key, then the list of keys,
970          * the crypto element, and then decrement our
971          * hold on the last resort crypto.
972          */
973         if (tp->md5sig_info->entries4) {
974                 int i;
975                 for (i = 0; i < tp->md5sig_info->entries4; i++)
976                         kfree(tp->md5sig_info->keys4[i].base.key);
977                 tp->md5sig_info->entries4 = 0;
978                 tcp_free_md5sig_pool();
979         }
980         if (tp->md5sig_info->keys4) {
981                 kfree(tp->md5sig_info->keys4);
982                 tp->md5sig_info->keys4 = NULL;
983                 tp->md5sig_info->alloced4  = 0;
984         }
985 }
986
987 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
988                                  int optlen)
989 {
990         struct tcp_md5sig cmd;
991         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
992         u8 *newkey;
993
994         if (optlen < sizeof(cmd))
995                 return -EINVAL;
996
997         if (copy_from_user(&cmd, optval, sizeof(cmd)))
998                 return -EFAULT;
999
1000         if (sin->sin_family != AF_INET)
1001                 return -EINVAL;
1002
1003         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1004                 if (!tcp_sk(sk)->md5sig_info)
1005                         return -ENOENT;
1006                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1007         }
1008
1009         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1010                 return -EINVAL;
1011
1012         if (!tcp_sk(sk)->md5sig_info) {
1013                 struct tcp_sock *tp = tcp_sk(sk);
1014                 struct tcp_md5sig_info *p;
1015
1016                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1017                 if (!p)
1018                         return -EINVAL;
1019
1020                 tp->md5sig_info = p;
1021                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1022         }
1023
1024         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1025         if (!newkey)
1026                 return -ENOMEM;
1027         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1028                                  newkey, cmd.tcpm_keylen);
1029 }
1030
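/*
 * tcp_v4_parse_md5_keys() above is reached via setsockopt(TCP_MD5SIG).
 * A userspace sketch of installing an RFC 2385 key for one peer follows;
 * it is illustrative only and kept compiled out, and the helper name and
 * arguments are invented for the example.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */

static int install_md5_key_example(int fd, const char *peer_ip,
				   const void *key, int keylen)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;

	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;		/* only AF_INET is accepted above */
	inet_pton(AF_INET, peer_ip, &sin->sin_addr);
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	/* A zero keylen would instead delete the key for this peer. */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif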
1031 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1032                                         __be32 daddr, __be32 saddr, int nbytes)
1033 {
1034         struct tcp4_pseudohdr *bp;
1035         struct scatterlist sg;
1036
1037         bp = &hp->md5_blk.ip4;
1038
1039         /*
1040          * 1. the TCP pseudo-header (in the order: source IP address,
1041          * destination IP address, zero-padded protocol number, and
1042          * segment length)
1043          */
1044         bp->saddr = saddr;
1045         bp->daddr = daddr;
1046         bp->pad = 0;
1047         bp->protocol = IPPROTO_TCP;
1048         bp->len = cpu_to_be16(nbytes);
1049
1050         sg_init_one(&sg, bp, sizeof(*bp));
1051         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1052 }
1053
1054 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1055                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1056 {
1057         struct tcp_md5sig_pool *hp;
1058         struct hash_desc *desc;
1059
1060         hp = tcp_get_md5sig_pool();
1061         if (!hp)
1062                 goto clear_hash_noput;
1063         desc = &hp->md5_desc;
1064
1065         if (crypto_hash_init(desc))
1066                 goto clear_hash;
1067         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1068                 goto clear_hash;
1069         if (tcp_md5_hash_header(hp, th))
1070                 goto clear_hash;
1071         if (tcp_md5_hash_key(hp, key))
1072                 goto clear_hash;
1073         if (crypto_hash_final(desc, md5_hash))
1074                 goto clear_hash;
1075
1076         tcp_put_md5sig_pool();
1077         return 0;
1078
1079 clear_hash:
1080         tcp_put_md5sig_pool();
1081 clear_hash_noput:
1082         memset(md5_hash, 0, 16);
1083         return 1;
1084 }
1085
1086 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1087                         struct sock *sk, struct request_sock *req,
1088                         struct sk_buff *skb)
1089 {
1090         struct tcp_md5sig_pool *hp;
1091         struct hash_desc *desc;
1092         struct tcphdr *th = tcp_hdr(skb);
1093         __be32 saddr, daddr;
1094
1095         if (sk) {
1096                 saddr = inet_sk(sk)->inet_saddr;
1097                 daddr = inet_sk(sk)->inet_daddr;
1098         } else if (req) {
1099                 saddr = inet_rsk(req)->loc_addr;
1100                 daddr = inet_rsk(req)->rmt_addr;
1101         } else {
1102                 const struct iphdr *iph = ip_hdr(skb);
1103                 saddr = iph->saddr;
1104                 daddr = iph->daddr;
1105         }
1106
1107         hp = tcp_get_md5sig_pool();
1108         if (!hp)
1109                 goto clear_hash_noput;
1110         desc = &hp->md5_desc;
1111
1112         if (crypto_hash_init(desc))
1113                 goto clear_hash;
1114
1115         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1116                 goto clear_hash;
1117         if (tcp_md5_hash_header(hp, th))
1118                 goto clear_hash;
1119         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1120                 goto clear_hash;
1121         if (tcp_md5_hash_key(hp, key))
1122                 goto clear_hash;
1123         if (crypto_hash_final(desc, md5_hash))
1124                 goto clear_hash;
1125
1126         tcp_put_md5sig_pool();
1127         return 0;
1128
1129 clear_hash:
1130         tcp_put_md5sig_pool();
1131 clear_hash_noput:
1132         memset(md5_hash, 0, 16);
1133         return 1;
1134 }
1135
1136 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1137
1138 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1139 {
1140         /*
1141          * This gets called for each TCP segment that arrives
1142          * so we want to be efficient.
1143          * We have 3 drop cases:
1144          * o No MD5 hash and one expected.
1145          * o MD5 hash and we're not expecting one.
1146          * o MD5 hash and it's wrong.
1147          */
1148         __u8 *hash_location = NULL;
1149         struct tcp_md5sig_key *hash_expected;
1150         const struct iphdr *iph = ip_hdr(skb);
1151         struct tcphdr *th = tcp_hdr(skb);
1152         int genhash;
1153         unsigned char newhash[16];
1154
1155         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1156         hash_location = tcp_parse_md5sig_option(th);
1157
1158         /* We've parsed the options - do we have a hash? */
1159         if (!hash_expected && !hash_location)
1160                 return 0;
1161
1162         if (hash_expected && !hash_location) {
1163                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1164                 return 1;
1165         }
1166
1167         if (!hash_expected && hash_location) {
1168                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1169                 return 1;
1170         }
1171
1172         /* Okay, so we have both hash_expected and hash_location -
1173          * we need to calculate the hash and compare.
1174          */
1175         genhash = tcp_v4_md5_hash_skb(newhash,
1176                                       hash_expected,
1177                                       NULL, NULL, skb);
1178
1179         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1180                 if (net_ratelimit()) {
1181                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1182                                &iph->saddr, ntohs(th->source),
1183                                &iph->daddr, ntohs(th->dest),
1184                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1185                 }
1186                 return 1;
1187         }
1188         return 0;
1189 }
1190
1191 #endif
1192
1193 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1194         .family         =       PF_INET,
1195         .obj_size       =       sizeof(struct tcp_request_sock),
1196         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1197         .send_ack       =       tcp_v4_reqsk_send_ack,
1198         .destructor     =       tcp_v4_reqsk_destructor,
1199         .send_reset     =       tcp_v4_send_reset,
1200         .syn_ack_timeout =      tcp_syn_ack_timeout,
1201 };
1202
1203 #ifdef CONFIG_TCP_MD5SIG
1204 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1205         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1206         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1207 };
1208 #endif
1209
1210 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1211         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1212         .twsk_unique    = tcp_twsk_unique,
1213         .twsk_destructor= tcp_twsk_destructor,
1214 };
1215
1216 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1217 {
1218         struct tcp_extend_values tmp_ext;
1219         struct tcp_options_received tmp_opt;
1220         u8 *hash_location;
1221         struct request_sock *req;
1222         struct inet_request_sock *ireq;
1223         struct tcp_sock *tp = tcp_sk(sk);
1224         struct dst_entry *dst = NULL;
1225         __be32 saddr = ip_hdr(skb)->saddr;
1226         __be32 daddr = ip_hdr(skb)->daddr;
1227         __u32 isn = TCP_SKB_CB(skb)->when;
1228 #ifdef CONFIG_SYN_COOKIES
1229         int want_cookie = 0;
1230 #else
1231 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1232 #endif
1233
1234         /* Never answer SYNs sent to broadcast or multicast */
1235         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1236                 goto drop;
1237
1238         /* TW buckets are converted to open requests without
1239          * limitation; they conserve resources and the peer is
1240          * evidently a real one.
1241          */
1242         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1243 #ifdef CONFIG_SYN_COOKIES
1244                 if (sysctl_tcp_syncookies) {
1245                         want_cookie = 1;
1246                 } else
1247 #endif
1248                 goto drop;
1249         }
1250
1251         /* Accept backlog is full. If we have already queued enough
1252          * warm entries in the syn queue, drop the request. It is better than
1253          * clogging the syn queue with openreqs with exponentially increasing
1254          * timeouts.
1255          */
1256         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1257                 goto drop;
1258
1259         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1260         if (!req)
1261                 goto drop;
1262
1263 #ifdef CONFIG_TCP_MD5SIG
1264         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1265 #endif
1266
1267         tcp_clear_options(&tmp_opt);
1268         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1269         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1270         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1271
1272         if (tmp_opt.cookie_plus > 0 &&
1273             tmp_opt.saw_tstamp &&
1274             !tp->rx_opt.cookie_out_never &&
1275             (sysctl_tcp_cookie_size > 0 ||
1276              (tp->cookie_values != NULL &&
1277               tp->cookie_values->cookie_desired > 0))) {
1278                 u8 *c;
1279                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1280                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1281
1282                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1283                         goto drop_and_release;
1284
1285                 /* Secret recipe starts with IP addresses */
1286                 *mess++ ^= daddr;
1287                 *mess++ ^= saddr;
1288
1289                 /* plus variable length Initiator Cookie */
1290                 c = (u8 *)mess;
1291                 while (l-- > 0)
1292                         *c++ ^= *hash_location++;
1293
1294 #ifdef CONFIG_SYN_COOKIES
1295                 want_cookie = 0;        /* not our kind of cookie */
1296 #endif
1297                 tmp_ext.cookie_out_never = 0; /* false */
1298                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1299         } else if (!tp->rx_opt.cookie_in_always) {
1300                 /* redundant indications, but ensure initialization. */
1301                 tmp_ext.cookie_out_never = 1; /* true */
1302                 tmp_ext.cookie_plus = 0;
1303         } else {
1304                 goto drop_and_release;
1305         }
1306         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1307
1308         if (want_cookie && !tmp_opt.saw_tstamp)
1309                 tcp_clear_options(&tmp_opt);
1310
1311         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1312         tcp_openreq_init(req, &tmp_opt, skb);
1313
1314         ireq = inet_rsk(req);
1315         ireq->loc_addr = daddr;
1316         ireq->rmt_addr = saddr;
1317         ireq->no_srccheck = inet_sk(sk)->transparent;
1318         ireq->opt = tcp_v4_save_options(sk, skb);
1319
1320         if (security_inet_conn_request(sk, skb, req))
1321                 goto drop_and_free;
1322
1323         if (!want_cookie)
1324                 TCP_ECN_create_request(req, tcp_hdr(skb));
1325
1326         if (want_cookie) {
1327 #ifdef CONFIG_SYN_COOKIES
1328                 syn_flood_warning(skb);
1329                 req->cookie_ts = tmp_opt.tstamp_ok;
1330 #endif
1331                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1332         } else if (!isn) {
1333                 struct inet_peer *peer = NULL;
1334
1335                 /* VJ's idea. We save the last timestamp seen
1336                  * from the destination in the peer table when entering
1337                  * TIME-WAIT state, and check against it before
1338                  * accepting a new connection request.
1339                  *
1340                  * If "isn" is not zero, this request hit an alive
1341                  * timewait bucket, so all the necessary checks
1342                  * are made in the function processing the timewait state.
1343                  */
1344                 if (tmp_opt.saw_tstamp &&
1345                     tcp_death_row.sysctl_tw_recycle &&
1346                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1347                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1348                     peer->v4daddr == saddr) {
1349                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1350                             (s32)(peer->tcp_ts - req->ts_recent) >
1351                                                         TCP_PAWS_WINDOW) {
1352                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1353                                 goto drop_and_release;
1354                         }
1355                 }
1356                 /* Kill the following clause, if you dislike this way. */
1357                 else if (!sysctl_tcp_syncookies &&
1358                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1359                           (sysctl_max_syn_backlog >> 2)) &&
1360                          (!peer || !peer->tcp_ts_stamp) &&
1361                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1362                         /* Without syncookies the last quarter of the
1363                          * backlog is filled only with destinations
1364                          * proven to be alive.
1365                          * It means that we continue to communicate
1366                          * with destinations already remembered
1367                          * at the moment of the synflood.
1368                          */
1369                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1370                                        &saddr, ntohs(tcp_hdr(skb)->source));
1371                         goto drop_and_release;
1372                 }
1373
1374                 isn = tcp_v4_init_sequence(skb);
1375         }
1376         tcp_rsk(req)->snt_isn = isn;
1377
1378         if (tcp_v4_send_synack(sk, dst, req,
1379                                (struct request_values *)&tmp_ext) ||
1380             want_cookie)
1381                 goto drop_and_free;
1382
1383         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1384         return 0;
1385
1386 drop_and_release:
1387         dst_release(dst);
1388 drop_and_free:
1389         reqsk_free(req);
1390 drop:
1391         return 0;
1392 }
1393
1394
1395 /*
1396  * The three way handshake has completed - we got a valid synack -
1397  * now create the new socket.
1398  */
1399 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1400                                   struct request_sock *req,
1401                                   struct dst_entry *dst)
1402 {
1403         struct inet_request_sock *ireq;
1404         struct inet_sock *newinet;
1405         struct tcp_sock *newtp;
1406         struct sock *newsk;
1407 #ifdef CONFIG_TCP_MD5SIG
1408         struct tcp_md5sig_key *key;
1409 #endif
1410
1411         if (sk_acceptq_is_full(sk))
1412                 goto exit_overflow;
1413
1414         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1415                 goto exit;
1416
1417         newsk = tcp_create_openreq_child(sk, req, skb);
1418         if (!newsk)
1419                 goto exit;
1420
1421         newsk->sk_gso_type = SKB_GSO_TCPV4;
1422         sk_setup_caps(newsk, dst);
1423
1424         newtp                 = tcp_sk(newsk);
1425         newinet               = inet_sk(newsk);
1426         ireq                  = inet_rsk(req);
1427         newinet->inet_daddr   = ireq->rmt_addr;
1428         newinet->inet_rcv_saddr = ireq->loc_addr;
1429         newinet->inet_saddr           = ireq->loc_addr;
1430         newinet->opt          = ireq->opt;
1431         ireq->opt             = NULL;
1432         newinet->mc_index     = inet_iif(skb);
1433         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1434         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1435         if (newinet->opt)
1436                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1437         newinet->inet_id = newtp->write_seq ^ jiffies;
1438
1439         tcp_mtup_init(newsk);
1440         tcp_sync_mss(newsk, dst_mtu(dst));
1441         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1442         if (tcp_sk(sk)->rx_opt.user_mss &&
1443             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1444                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
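        /* At this point advmss is the route's cached advertised MSS, clamped
         * down to the listener's TCP_MAXSEG (user_mss) when that is set and
         * smaller, so the child never advertises more than the application
         * asked for.
         */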
1445
1446         tcp_initialize_rcv_mss(newsk);
1447
1448 #ifdef CONFIG_TCP_MD5SIG
1449         /* Copy over the MD5 key from the original socket */
1450         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1451         if (key != NULL) {
1452                 /*
1453                  * We're using one, so create a matching key
1454                  * on the newsk structure. If we fail to get
1455                  * memory, then we end up not copying the key
1456                  * across. Shucks.
1457                  */
1458                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1459                 if (newkey != NULL)
1460                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1461                                           newkey, key->keylen);
1462                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1463         }
1464 #endif
1465
1466         __inet_hash_nolisten(newsk, NULL);
1467         __inet_inherit_port(sk, newsk);
1468
1469         return newsk;
1470
1471 exit_overflow:
1472         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1473 exit:
1474         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1475         dst_release(dst);
1476         return NULL;
1477 }
1478
1479 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1480 {
1481         struct tcphdr *th = tcp_hdr(skb);
1482         const struct iphdr *iph = ip_hdr(skb);
1483         struct sock *nsk;
1484         struct request_sock **prev;
1485         /* Find possible connection requests. */
1486         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1487                                                        iph->saddr, iph->daddr);
1488         if (req)
1489                 return tcp_check_req(sk, skb, req, prev);
1490
1491         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1492                         th->source, iph->daddr, th->dest, inet_iif(skb));
1493
1494         if (nsk) {
1495                 if (nsk->sk_state != TCP_TIME_WAIT) {
1496                         bh_lock_sock(nsk);
1497                         return nsk;
1498                 }
1499                 inet_twsk_put(inet_twsk(nsk));
1500                 return NULL;
1501         }
1502
1503 #ifdef CONFIG_SYN_COOKIES
1504         if (!th->rst && !th->syn && th->ack)
1505                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1506 #endif
1507         return sk;
1508 }
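/*
 * Editorial note: for a listening socket an incoming segment may belong to a
 * pending request_sock (tcp_check_req() then tries to complete the handshake),
 * to an already established child found in the ehash (returned locked), or -
 * with syncookies enabled - it may be an ACK that encodes the whole request
 * and is decoded by cookie_v4_check().  Otherwise the listener itself is
 * returned unchanged.
 */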
1509
1510 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1511 {
1512         const struct iphdr *iph = ip_hdr(skb);
1513
1514         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1515                 if (!tcp_v4_check(skb->len, iph->saddr,
1516                                   iph->daddr, skb->csum)) {
1517                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1518                         return 0;
1519                 }
1520         }
1521
1522         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1523                                        skb->len, IPPROTO_TCP, 0);
1524
1525         if (skb->len <= 76) {
1526                 return __skb_checksum_complete(skb);
1527         }
1528         return 0;
1529 }
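/*
 * Editorial sketch (not part of the original file): the receive-side helper
 * above seeds skb->csum with the IPv4 pseudo header and lets the generic code
 * fold the TCP header and payload into it.  For contrast, a rough
 * transmit-side counterpart built from the same primitives could look like
 * the hypothetical helper below, where 'th' is assumed to point at a complete
 * TCP header immediately followed by its payload.
 */
#if 0	/* illustration only, never compiled */
static void example_tcp_v4_fill_check(struct tcphdr *th, int len,
				      __be32 saddr, __be32 daddr)
{
	th->check = 0;
	th->check = csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP,
				      csum_partial(th, len, 0));
}
#endif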
1530
1531
1532 /* The socket must have its spinlock held when we get
1533  * here.
1534  *
1535  * We have a potential double-lock case here, so even when
1536  * doing backlog processing we use the BH locking scheme.
1537  * This is because we cannot sleep with the original spinlock
1538  * held.
1539  */
1540 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1541 {
1542         struct sock *rsk;
1543 #ifdef CONFIG_TCP_MD5SIG
1544         /*
1545          * We really want to reject the packet as early as possible
1546          * if:
1547          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1548          *  o There is an MD5 option and we're not expecting one
1549          */
1550         if (tcp_v4_inbound_md5_hash(sk, skb))
1551                 goto discard;
1552 #endif
1553
1554         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1555                 TCP_CHECK_TIMER(sk);
1556                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1557                         rsk = sk;
1558                         goto reset;
1559                 }
1560                 TCP_CHECK_TIMER(sk);
1561                 return 0;
1562         }
1563
1564         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1565                 goto csum_err;
1566
1567         if (sk->sk_state == TCP_LISTEN) {
1568                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1569                 if (!nsk)
1570                         goto discard;
1571
1572                 if (nsk != sk) {
1573                         if (tcp_child_process(sk, nsk, skb)) {
1574                                 rsk = nsk;
1575                                 goto reset;
1576                         }
1577                         return 0;
1578                 }
1579         }
1580
1581         TCP_CHECK_TIMER(sk);
1582         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1583                 rsk = sk;
1584                 goto reset;
1585         }
1586         TCP_CHECK_TIMER(sk);
1587         return 0;
1588
1589 reset:
1590         tcp_v4_send_reset(rsk, skb);
1591 discard:
1592         kfree_skb(skb);
1593         /* Be careful here. If this function gets more complicated and
1594          * gcc suffers from register pressure on the x86, sk (in %ebx)
1595          * might be destroyed here. This current version compiles correctly,
1596          * but you have been warned.
1597          */
1598         return 0;
1599
1600 csum_err:
1601         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1602         goto discard;
1603 }
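/*
 * Editorial note: tcp_v4_do_rcv() is reached from two directions - directly
 * from tcp_v4_rcv() below, in softirq context with bh_lock_sock() held, and
 * from process context when release_sock() replays segments that were queued
 * on the socket backlog while the socket was owned by the user.  That is why
 * the comment above insists on the BH locking scheme.
 */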
1604
1605 /*
1606  *      From tcp_input.c
1607  */
1608
1609 int tcp_v4_rcv(struct sk_buff *skb)
1610 {
1611         const struct iphdr *iph;
1612         struct tcphdr *th;
1613         struct sock *sk;
1614         int ret;
1615         struct net *net = dev_net(skb->dev);
1616
1617         if (skb->pkt_type != PACKET_HOST)
1618                 goto discard_it;
1619
1620         /* Count it even if it's bad */
1621         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1622
1623         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1624                 goto discard_it;
1625
1626         th = tcp_hdr(skb);
1627
1628         if (th->doff < sizeof(struct tcphdr) / 4)
1629                 goto bad_packet;
1630         if (!pskb_may_pull(skb, th->doff * 4))
1631                 goto discard_it;
1632
1633         /* An explanation is required here, I think.
1634          * Packet length and doff are validated by header prediction,
1635          * provided the th->doff == 0 case has been eliminated above.
1636          * So, we defer those checks. */
1637         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1638                 goto bad_packet;
1639
1640         th = tcp_hdr(skb);
1641         iph = ip_hdr(skb);
1642         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1643         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1644                                     skb->len - th->doff * 4);
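        /* end_seq accounts for the payload plus one sequence number each for
         * SYN and FIN: a bare SYN ends at seq + 1, a 100 byte segment
         * carrying FIN at seq + 101.
         */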
1645         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1646         TCP_SKB_CB(skb)->when    = 0;
1647         TCP_SKB_CB(skb)->flags   = iph->tos;
1648         TCP_SKB_CB(skb)->sacked  = 0;
1649
1650         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1651         if (!sk)
1652                 goto no_tcp_socket;
1653
1654 process:
1655         if (sk->sk_state == TCP_TIME_WAIT)
1656                 goto do_time_wait;
1657
1658         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1659                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1660                 goto discard_and_relse;
1661         }
1662
1663         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1664                 goto discard_and_relse;
1665         nf_reset(skb);
1666
1667         if (sk_filter(sk, skb))
1668                 goto discard_and_relse;
1669
1670         skb->dev = NULL;
1671
1672         bh_lock_sock_nested(sk);
1673         ret = 0;
1674         if (!sock_owned_by_user(sk)) {
1675 #ifdef CONFIG_NET_DMA
1676                 struct tcp_sock *tp = tcp_sk(sk);
1677                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1678                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1679                 if (tp->ucopy.dma_chan)
1680                         ret = tcp_v4_do_rcv(sk, skb);
1681                 else
1682 #endif
1683                 {
1684                         if (!tcp_prequeue(sk, skb))
1685                                 ret = tcp_v4_do_rcv(sk, skb);
1686                 }
1687         } else if (unlikely(sk_add_backlog(sk, skb))) {
1688                 bh_unlock_sock(sk);
1689                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1690                 goto discard_and_relse;
1691         }
1692         bh_unlock_sock(sk);
1693
1694         sock_put(sk);
1695
1696         return ret;
1697
1698 no_tcp_socket:
1699         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1700                 goto discard_it;
1701
1702         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1703 bad_packet:
1704                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1705         } else {
1706                 tcp_v4_send_reset(NULL, skb);
1707         }
1708
1709 discard_it:
1710         /* Discard frame. */
1711         kfree_skb(skb);
1712         return 0;
1713
1714 discard_and_relse:
1715         sock_put(sk);
1716         goto discard_it;
1717
1718 do_time_wait:
1719         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1720                 inet_twsk_put(inet_twsk(sk));
1721                 goto discard_it;
1722         }
1723
1724         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1725                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1726                 inet_twsk_put(inet_twsk(sk));
1727                 goto discard_it;
1728         }
1729         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1730         case TCP_TW_SYN: {
1731                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1732                                                         &tcp_hashinfo,
1733                                                         iph->daddr, th->dest,
1734                                                         inet_iif(skb));
1735                 if (sk2) {
1736                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1737                         inet_twsk_put(inet_twsk(sk));
1738                         sk = sk2;
1739                         goto process;
1740                 }
1741                 /* Fall through to ACK */
1742         }
1743         case TCP_TW_ACK:
1744                 tcp_v4_timewait_ack(sk, skb);
1745                 break;
1746         case TCP_TW_RST:
1747                 goto no_tcp_socket;
1748         case TCP_TW_SUCCESS:;
1749         }
1750         goto discard_it;
1751 }
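/*
 * Editorial note: the receive path above is, in order: validate the TCP
 * header and checksum, look the segment up in the established/timewait hash
 * (falling back to listeners inside __inet_lookup_skb()), apply the minimum
 * TTL, IPsec policy and socket filter checks, and finally hand the segment to
 * tcp_v4_do_rcv() either directly, via the prequeue, or via the socket
 * backlog, depending on who currently owns the socket.
 */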
1752
1753 /* VJ's idea. Save the last timestamp seen from this destination
1754  * and hold it for at least the normal timewait interval, so it can be used
1755  * for duplicate segment detection in subsequent connections before they
1756  * enter the synchronized state.
1757  */
1758
1759 int tcp_v4_remember_stamp(struct sock *sk)
1760 {
1761         struct inet_sock *inet = inet_sk(sk);
1762         struct tcp_sock *tp = tcp_sk(sk);
1763         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1764         struct inet_peer *peer = NULL;
1765         int release_it = 0;
1766
1767         if (!rt || rt->rt_dst != inet->inet_daddr) {
1768                 peer = inet_getpeer(inet->inet_daddr, 1);
1769                 release_it = 1;
1770         } else {
1771                 if (!rt->peer)
1772                         rt_bind_peer(rt, 1);
1773                 peer = rt->peer;
1774         }
1775
1776         if (peer) {
1777                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1778                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1779                      peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1780                         peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1781                         peer->tcp_ts = tp->rx_opt.ts_recent;
1782                 }
1783                 if (release_it)
1784                         inet_putpeer(peer);
1785                 return 1;
1786         }
1787
1788         return 0;
1789 }
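/*
 * Editorial sketch (not part of the original file): the stamp cached here is
 * what tcp_v4_conn_request() consults when deciding whether a later SYN from
 * the same peer carries a stale timestamp.  Roughly, the passive reject
 * condition is the hypothetical predicate below.
 */
#if 0	/* illustration only, never compiled */
static bool example_paws_reject(const struct inet_peer *peer, u32 ts_recent)
{
	return (u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
	       (s32)(peer->tcp_ts - ts_recent) > TCP_PAWS_WINDOW;
}
#endif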
1790
1791 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1792 {
1793         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1794
1795         if (peer) {
1796                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1797
1798                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1799                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1800                      peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1801                         peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1802                         peer->tcp_ts       = tcptw->tw_ts_recent;
1803                 }
1804                 inet_putpeer(peer);
1805                 return 1;
1806         }
1807
1808         return 0;
1809 }
1810
1811 const struct inet_connection_sock_af_ops ipv4_specific = {
1812         .queue_xmit        = ip_queue_xmit,
1813         .send_check        = tcp_v4_send_check,
1814         .rebuild_header    = inet_sk_rebuild_header,
1815         .conn_request      = tcp_v4_conn_request,
1816         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1817         .remember_stamp    = tcp_v4_remember_stamp,
1818         .net_header_len    = sizeof(struct iphdr),
1819         .setsockopt        = ip_setsockopt,
1820         .getsockopt        = ip_getsockopt,
1821         .addr2sockaddr     = inet_csk_addr2sockaddr,
1822         .sockaddr_len      = sizeof(struct sockaddr_in),
1823         .bind_conflict     = inet_csk_bind_conflict,
1824 #ifdef CONFIG_COMPAT
1825         .compat_setsockopt = compat_ip_setsockopt,
1826         .compat_getsockopt = compat_ip_getsockopt,
1827 #endif
1828 };
1829
1830 #ifdef CONFIG_TCP_MD5SIG
1831 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1832         .md5_lookup             = tcp_v4_md5_lookup,
1833         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1834         .md5_add                = tcp_v4_md5_add_func,
1835         .md5_parse              = tcp_v4_parse_md5_keys,
1836 };
1837 #endif
1838
1839 /* NOTE: A lot of fields are set to zero explicitly by the call to
1840  *       sk_alloc(), so they need not be initialized here.
1841  */
1842 static int tcp_v4_init_sock(struct sock *sk)
1843 {
1844         struct inet_connection_sock *icsk = inet_csk(sk);
1845         struct tcp_sock *tp = tcp_sk(sk);
1846
1847         skb_queue_head_init(&tp->out_of_order_queue);
1848         tcp_init_xmit_timers(sk);
1849         tcp_prequeue_init(tp);
1850
1851         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1852         tp->mdev = TCP_TIMEOUT_INIT;
1853
1854         /* So many TCP implementations out there (incorrectly) count the
1855          * initial SYN frame in their delayed-ACK and congestion control
1856          * algorithms that we must have the following bandaid to talk
1857          * efficiently to them.  -DaveM
1858          */
1859         tp->snd_cwnd = 2;
1860
1861         /* See draft-stevens-tcpca-spec-01 for discussion of the
1862          * initialization of these values.
1863          */
1864         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1865         tp->snd_cwnd_clamp = ~0;
1866         tp->mss_cache = TCP_MSS_DEFAULT;
1867
1868         tp->reordering = sysctl_tcp_reordering;
1869         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1870
1871         sk->sk_state = TCP_CLOSE;
1872
1873         sk->sk_write_space = sk_stream_write_space;
1874         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1875
1876         icsk->icsk_af_ops = &ipv4_specific;
1877         icsk->icsk_sync_mss = tcp_sync_mss;
1878 #ifdef CONFIG_TCP_MD5SIG
1879         tp->af_specific = &tcp_sock_ipv4_specific;
1880 #endif
1881
1882         /* TCP Cookie Transactions */
1883         if (sysctl_tcp_cookie_size > 0) {
1884                 /* Default, cookies without s_data_payload. */
1885                 tp->cookie_values =
1886                         kzalloc(sizeof(*tp->cookie_values),
1887                                 sk->sk_allocation);
1888                 if (tp->cookie_values != NULL)
1889                         kref_init(&tp->cookie_values->kref);
1890         }
1891         /* Presumed zeroed, in order of appearance:
1892          *      cookie_in_always, cookie_out_never,
1893          *      s_data_constant, s_data_in, s_data_out
1894          */
1895         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1896         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1897
1898         local_bh_disable();
1899         percpu_counter_inc(&tcp_sockets_allocated);
1900         local_bh_enable();
1901
1902         return 0;
1903 }
1904
1905 void tcp_v4_destroy_sock(struct sock *sk)
1906 {
1907         struct tcp_sock *tp = tcp_sk(sk);
1908
1909         tcp_clear_xmit_timers(sk);
1910
1911         tcp_cleanup_congestion_control(sk);
1912
1913         /* Clean up the write buffer. */
1914         tcp_write_queue_purge(sk);
1915
1916         /* Cleans up our, hopefully empty, out_of_order_queue. */
1917         __skb_queue_purge(&tp->out_of_order_queue);
1918
1919 #ifdef CONFIG_TCP_MD5SIG
1920         /* Clean up the MD5 key list, if any */
1921         if (tp->md5sig_info) {
1922                 tcp_v4_clear_md5_list(sk);
1923                 kfree(tp->md5sig_info);
1924                 tp->md5sig_info = NULL;
1925         }
1926 #endif
1927
1928 #ifdef CONFIG_NET_DMA
1929         /* Cleans up our sk_async_wait_queue */
1930         __skb_queue_purge(&sk->sk_async_wait_queue);
1931 #endif
1932
1933         /* Clean up the prequeue; it really should be empty already. */
1934         __skb_queue_purge(&tp->ucopy.prequeue);
1935
1936         /* Clean up a referenced TCP bind bucket. */
1937         if (inet_csk(sk)->icsk_bind_hash)
1938                 inet_put_port(sk);
1939
1940         /*
1941          * If sendmsg cached page exists, toss it.
1942          */
1943         if (sk->sk_sndmsg_page) {
1944                 __free_page(sk->sk_sndmsg_page);
1945                 sk->sk_sndmsg_page = NULL;
1946         }
1947
1948         /* TCP Cookie Transactions */
1949         if (tp->cookie_values != NULL) {
1950                 kref_put(&tp->cookie_values->kref,
1951                          tcp_cookie_values_release);
1952                 tp->cookie_values = NULL;
1953         }
1954
1955         percpu_counter_dec(&tcp_sockets_allocated);
1956 }
1957
1958 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1959
1960 #ifdef CONFIG_PROC_FS
1961 /* Proc filesystem TCP sock list dumping. */
1962
1963 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1964 {
1965         return hlist_nulls_empty(head) ? NULL :
1966                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1967 }
1968
1969 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1970 {
1971         return !is_a_nulls(tw->tw_node.next) ?
1972                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1973 }
1974
1975 static void *listening_get_next(struct seq_file *seq, void *cur)
1976 {
1977         struct inet_connection_sock *icsk;
1978         struct hlist_nulls_node *node;
1979         struct sock *sk = cur;
1980         struct inet_listen_hashbucket *ilb;
1981         struct tcp_iter_state *st = seq->private;
1982         struct net *net = seq_file_net(seq);
1983
1984         if (!sk) {
1985                 st->bucket = 0;
1986                 ilb = &tcp_hashinfo.listening_hash[0];
1987                 spin_lock_bh(&ilb->lock);
1988                 sk = sk_nulls_head(&ilb->head);
1989                 goto get_sk;
1990         }
1991         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1992         ++st->num;
1993
1994         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1995                 struct request_sock *req = cur;
1996
1997                 icsk = inet_csk(st->syn_wait_sk);
1998                 req = req->dl_next;
1999                 while (1) {
2000                         while (req) {
2001                                 if (req->rsk_ops->family == st->family) {
2002                                         cur = req;
2003                                         goto out;
2004                                 }
2005                                 req = req->dl_next;
2006                         }
2007                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2008                                 break;
2009 get_req:
2010                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2011                 }
2012                 sk        = sk_next(st->syn_wait_sk);
2013                 st->state = TCP_SEQ_STATE_LISTENING;
2014                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2015         } else {
2016                 icsk = inet_csk(sk);
2017                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2018                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2019                         goto start_req;
2020                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021                 sk = sk_next(sk);
2022         }
2023 get_sk:
2024         sk_nulls_for_each_from(sk, node) {
2025                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2026                         cur = sk;
2027                         goto out;
2028                 }
2029                 icsk = inet_csk(sk);
2030                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2031                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2032 start_req:
2033                         st->uid         = sock_i_uid(sk);
2034                         st->syn_wait_sk = sk;
2035                         st->state       = TCP_SEQ_STATE_OPENREQ;
2036                         st->sbucket     = 0;
2037                         goto get_req;
2038                 }
2039                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2040         }
2041         spin_unlock_bh(&ilb->lock);
2042         if (++st->bucket < INET_LHTABLE_SIZE) {
2043                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2044                 spin_lock_bh(&ilb->lock);
2045                 sk = sk_nulls_head(&ilb->head);
2046                 goto get_sk;
2047         }
2048         cur = NULL;
2049 out:
2050         return cur;
2051 }
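/*
 * Editorial note: listening_get_next() alternates between two iterator
 * states - TCP_SEQ_STATE_LISTENING walks the listening hash buckets, and
 * whenever a listener has pending open requests the iterator switches to
 * TCP_SEQ_STATE_OPENREQ and walks that listener's SYN table (under
 * syn_wait_lock) before moving on to the next socket.
 */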
2052
2053 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2054 {
2055         void *rc = listening_get_next(seq, NULL);
2056
2057         while (rc && *pos) {
2058                 rc = listening_get_next(seq, rc);
2059                 --*pos;
2060         }
2061         return rc;
2062 }
2063
2064 static inline int empty_bucket(struct tcp_iter_state *st)
2065 {
2066         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2067                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2068 }
2069
2070 static void *established_get_first(struct seq_file *seq)
2071 {
2072         struct tcp_iter_state *st = seq->private;
2073         struct net *net = seq_file_net(seq);
2074         void *rc = NULL;
2075
2076         for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2077                 struct sock *sk;
2078                 struct hlist_nulls_node *node;
2079                 struct inet_timewait_sock *tw;
2080                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2081
2082                 /* Lockless fast path for the common case of empty buckets */
2083                 if (empty_bucket(st))
2084                         continue;
2085
2086                 spin_lock_bh(lock);
2087                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2088                         if (sk->sk_family != st->family ||
2089                             !net_eq(sock_net(sk), net)) {
2090                                 continue;
2091                         }
2092                         rc = sk;
2093                         goto out;
2094                 }
2095                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2096                 inet_twsk_for_each(tw, node,
2097                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2098                         if (tw->tw_family != st->family ||
2099                             !net_eq(twsk_net(tw), net)) {
2100                                 continue;
2101                         }
2102                         rc = tw;
2103                         goto out;
2104                 }
2105                 spin_unlock_bh(lock);
2106                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2107         }
2108 out:
2109         return rc;
2110 }
2111
2112 static void *established_get_next(struct seq_file *seq, void *cur)
2113 {
2114         struct sock *sk = cur;
2115         struct inet_timewait_sock *tw;
2116         struct hlist_nulls_node *node;
2117         struct tcp_iter_state *st = seq->private;
2118         struct net *net = seq_file_net(seq);
2119
2120         ++st->num;
2121
2122         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2123                 tw = cur;
2124                 tw = tw_next(tw);
2125 get_tw:
2126                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2127                         tw = tw_next(tw);
2128                 }
2129                 if (tw) {
2130                         cur = tw;
2131                         goto out;
2132                 }
2133                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2134                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2135
2136                 /* Look for the next non-empty bucket */
2137                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2138                                 empty_bucket(st))
2139                         ;
2140                 if (st->bucket > tcp_hashinfo.ehash_mask)
2141                         return NULL;
2142
2143                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2144                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2145         } else
2146                 sk = sk_nulls_next(sk);
2147
2148         sk_nulls_for_each_from(sk, node) {
2149                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2150                         goto found;
2151         }
2152
2153         st->state = TCP_SEQ_STATE_TIME_WAIT;
2154         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2155         goto get_tw;
2156 found:
2157         cur = sk;
2158 out:
2159         return cur;
2160 }
2161
2162 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2163 {
2164         void *rc = established_get_first(seq);
2165
2166         while (rc && pos) {
2167                 rc = established_get_next(seq, rc);
2168                 --pos;
2169         }
2170         return rc;
2171 }
2172
2173 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2174 {
2175         void *rc;
2176         struct tcp_iter_state *st = seq->private;
2177
2178         st->state = TCP_SEQ_STATE_LISTENING;
2179         rc        = listening_get_idx(seq, &pos);
2180
2181         if (!rc) {
2182                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2183                 rc        = established_get_idx(seq, pos);
2184         }
2185
2186         return rc;
2187 }
2188
2189 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2190 {
2191         struct tcp_iter_state *st = seq->private;
2192         st->state = TCP_SEQ_STATE_LISTENING;
2193         st->num = 0;
2194         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2195 }
2196
2197 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2198 {
2199         void *rc = NULL;
2200         struct tcp_iter_state *st;
2201
2202         if (v == SEQ_START_TOKEN) {
2203                 rc = tcp_get_idx(seq, 0);
2204                 goto out;
2205         }
2206         st = seq->private;
2207
2208         switch (st->state) {
2209         case TCP_SEQ_STATE_OPENREQ:
2210         case TCP_SEQ_STATE_LISTENING:
2211                 rc = listening_get_next(seq, v);
2212                 if (!rc) {
2213                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2214                         rc        = established_get_first(seq);
2215                 }
2216                 break;
2217         case TCP_SEQ_STATE_ESTABLISHED:
2218         case TCP_SEQ_STATE_TIME_WAIT:
2219                 rc = established_get_next(seq, v);
2220                 break;
2221         }
2222 out:
2223         ++*pos;
2224         return rc;
2225 }
2226
2227 static void tcp_seq_stop(struct seq_file *seq, void *v)
2228 {
2229         struct tcp_iter_state *st = seq->private;
2230
2231         switch (st->state) {
2232         case TCP_SEQ_STATE_OPENREQ:
2233                 if (v) {
2234                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2235                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2236                 }
2237         case TCP_SEQ_STATE_LISTENING:
2238                 if (v != SEQ_START_TOKEN)
2239                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2240                 break;
2241         case TCP_SEQ_STATE_TIME_WAIT:
2242         case TCP_SEQ_STATE_ESTABLISHED:
2243                 if (v)
2244                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2245                 break;
2246         }
2247 }
2248
2249 static int tcp_seq_open(struct inode *inode, struct file *file)
2250 {
2251         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2252         struct tcp_iter_state *s;
2253         int err;
2254
2255         err = seq_open_net(inode, file, &afinfo->seq_ops,
2256                           sizeof(struct tcp_iter_state));
2257         if (err < 0)
2258                 return err;
2259
2260         s = ((struct seq_file *)file->private_data)->private;
2261         s->family               = afinfo->family;
2262         return 0;
2263 }
2264
2265 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2266 {
2267         int rc = 0;
2268         struct proc_dir_entry *p;
2269
2270         afinfo->seq_fops.open           = tcp_seq_open;
2271         afinfo->seq_fops.read           = seq_read;
2272         afinfo->seq_fops.llseek         = seq_lseek;
2273         afinfo->seq_fops.release        = seq_release_net;
2274
2275         afinfo->seq_ops.start           = tcp_seq_start;
2276         afinfo->seq_ops.next            = tcp_seq_next;
2277         afinfo->seq_ops.stop            = tcp_seq_stop;
2278
2279         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2280                              &afinfo->seq_fops, afinfo);
2281         if (!p)
2282                 rc = -ENOMEM;
2283         return rc;
2284 }
2285
2286 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2287 {
2288         proc_net_remove(net, afinfo->name);
2289 }
2290
2291 static void get_openreq4(struct sock *sk, struct request_sock *req,
2292                          struct seq_file *f, int i, int uid, int *len)
2293 {
2294         const struct inet_request_sock *ireq = inet_rsk(req);
2295         int ttd = req->expires - jiffies;
2296
2297         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2298                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2299                 i,
2300                 ireq->loc_addr,
2301                 ntohs(inet_sk(sk)->inet_sport),
2302                 ireq->rmt_addr,
2303                 ntohs(ireq->rmt_port),
2304                 TCP_SYN_RECV,
2305                 0, 0, /* could print option size, but that is af dependent. */
2306                 1,    /* timers active (only the expire timer) */
2307                 jiffies_to_clock_t(ttd),
2308                 req->retrans,
2309                 uid,
2310                 0,  /* non standard timer */
2311                 0, /* open_requests have no inode */
2312                 atomic_read(&sk->sk_refcnt),
2313                 req,
2314                 len);
2315 }
2316
2317 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2318 {
2319         int timer_active;
2320         unsigned long timer_expires;
2321         struct tcp_sock *tp = tcp_sk(sk);
2322         const struct inet_connection_sock *icsk = inet_csk(sk);
2323         struct inet_sock *inet = inet_sk(sk);
2324         __be32 dest = inet->inet_daddr;
2325         __be32 src = inet->inet_rcv_saddr;
2326         __u16 destp = ntohs(inet->inet_dport);
2327         __u16 srcp = ntohs(inet->inet_sport);
2328         int rx_queue;
2329
2330         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2331                 timer_active    = 1;
2332                 timer_expires   = icsk->icsk_timeout;
2333         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2334                 timer_active    = 4;
2335                 timer_expires   = icsk->icsk_timeout;
2336         } else if (timer_pending(&sk->sk_timer)) {
2337                 timer_active    = 2;
2338                 timer_expires   = sk->sk_timer.expires;
2339         } else {
2340                 timer_active    = 0;
2341                 timer_expires = jiffies;
2342         }
2343
2344         if (sk->sk_state == TCP_LISTEN)
2345                 rx_queue = sk->sk_ack_backlog;
2346         else
2347                 /*
2348                  * Because we don't lock the socket, we might read a transient negative value.
2349                  */
2350                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2351
2352         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2353                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2354                 i, src, srcp, dest, destp, sk->sk_state,
2355                 tp->write_seq - tp->snd_una,
2356                 rx_queue,
2357                 timer_active,
2358                 jiffies_to_clock_t(timer_expires - jiffies),
2359                 icsk->icsk_retransmits,
2360                 sock_i_uid(sk),
2361                 icsk->icsk_probes_out,
2362                 sock_i_ino(sk),
2363                 atomic_read(&sk->sk_refcnt), sk,
2364                 jiffies_to_clock_t(icsk->icsk_rto),
2365                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2366                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2367                 tp->snd_cwnd,
2368                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2369                 len);
2370 }
2371
2372 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2373                                struct seq_file *f, int i, int *len)
2374 {
2375         __be32 dest, src;
2376         __u16 destp, srcp;
2377         int ttd = tw->tw_ttd - jiffies;
2378
2379         if (ttd < 0)
2380                 ttd = 0;
2381
2382         dest  = tw->tw_daddr;
2383         src   = tw->tw_rcv_saddr;
2384         destp = ntohs(tw->tw_dport);
2385         srcp  = ntohs(tw->tw_sport);
2386
2387         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2388                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2389                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2390                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2391                 atomic_read(&tw->tw_refcnt), tw, len);
2392 }
2393
2394 #define TMPSZ 150
2395
2396 static int tcp4_seq_show(struct seq_file *seq, void *v)
2397 {
2398         struct tcp_iter_state *st;
2399         int len;
2400
2401         if (v == SEQ_START_TOKEN) {
2402                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2403                            "  sl  local_address rem_address   st tx_queue "
2404                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2405                            "inode");
2406                 goto out;
2407         }
2408         st = seq->private;
2409
2410         switch (st->state) {
2411         case TCP_SEQ_STATE_LISTENING:
2412         case TCP_SEQ_STATE_ESTABLISHED:
2413                 get_tcp4_sock(v, seq, st->num, &len);
2414                 break;
2415         case TCP_SEQ_STATE_OPENREQ:
2416                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2417                 break;
2418         case TCP_SEQ_STATE_TIME_WAIT:
2419                 get_timewait4_sock(v, seq, st->num, &len);
2420                 break;
2421         }
2422         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2423 out:
2424         return 0;
2425 }
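/*
 * Editorial note: the dump helpers above print addresses and ports as raw
 * hex (%08X:%04X), the address in its stored network byte order; on a
 * little-endian host 127.0.0.1:80 therefore appears as 0100007F:0050, and
 * the 'st' column is the numeric socket state (0A == TCP_LISTEN).
 */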
2426
2427 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2428         .name           = "tcp",
2429         .family         = AF_INET,
2430         .seq_fops       = {
2431                 .owner          = THIS_MODULE,
2432         },
2433         .seq_ops        = {
2434                 .show           = tcp4_seq_show,
2435         },
2436 };
2437
2438 static int __net_init tcp4_proc_init_net(struct net *net)
2439 {
2440         return tcp_proc_register(net, &tcp4_seq_afinfo);
2441 }
2442
2443 static void __net_exit tcp4_proc_exit_net(struct net *net)
2444 {
2445         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2446 }
2447
2448 static struct pernet_operations tcp4_net_ops = {
2449         .init = tcp4_proc_init_net,
2450         .exit = tcp4_proc_exit_net,
2451 };
2452
2453 int __init tcp4_proc_init(void)
2454 {
2455         return register_pernet_subsys(&tcp4_net_ops);
2456 }
2457
2458 void tcp4_proc_exit(void)
2459 {
2460         unregister_pernet_subsys(&tcp4_net_ops);
2461 }
2462 #endif /* CONFIG_PROC_FS */
2463
2464 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2465 {
2466         struct iphdr *iph = skb_gro_network_header(skb);
2467
2468         switch (skb->ip_summed) {
2469         case CHECKSUM_COMPLETE:
2470                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2471                                   skb->csum)) {
2472                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2473                         break;
2474                 }
2475
2476                 /* fall through */
2477         case CHECKSUM_NONE:
2478                 NAPI_GRO_CB(skb)->flush = 1;
2479                 return NULL;
2480         }
2481
2482         return tcp_gro_receive(head, skb);
2483 }
2484 EXPORT_SYMBOL(tcp4_gro_receive);
2485
2486 int tcp4_gro_complete(struct sk_buff *skb)
2487 {
2488         struct iphdr *iph = ip_hdr(skb);
2489         struct tcphdr *th = tcp_hdr(skb);
2490
2491         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2492                                   iph->saddr, iph->daddr, 0);
2493         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2494
2495         return tcp_gro_complete(skb);
2496 }
2497 EXPORT_SYMBOL(tcp4_gro_complete);
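/*
 * Editorial note: after GRO has merged several segments into one large
 * packet, tcp4_gro_complete() rewrites th->check to hold only the inverted
 * pseudo header sum and tags the skb as SKB_GSO_TCPV4 - the same
 * CHECKSUM_PARTIAL-style state that later GSO segmentation or hardware
 * checksum offload expects to find.
 */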
2498
2499 struct proto tcp_prot = {
2500         .name                   = "TCP",
2501         .owner                  = THIS_MODULE,
2502         .close                  = tcp_close,
2503         .connect                = tcp_v4_connect,
2504         .disconnect             = tcp_disconnect,
2505         .accept                 = inet_csk_accept,
2506         .ioctl                  = tcp_ioctl,
2507         .init                   = tcp_v4_init_sock,
2508         .destroy                = tcp_v4_destroy_sock,
2509         .shutdown               = tcp_shutdown,
2510         .setsockopt             = tcp_setsockopt,
2511         .getsockopt             = tcp_getsockopt,
2512         .recvmsg                = tcp_recvmsg,
2513         .backlog_rcv            = tcp_v4_do_rcv,
2514         .hash                   = inet_hash,
2515         .unhash                 = inet_unhash,
2516         .get_port               = inet_csk_get_port,
2517         .enter_memory_pressure  = tcp_enter_memory_pressure,
2518         .sockets_allocated      = &tcp_sockets_allocated,
2519         .orphan_count           = &tcp_orphan_count,
2520         .memory_allocated       = &tcp_memory_allocated,
2521         .memory_pressure        = &tcp_memory_pressure,
2522         .sysctl_mem             = sysctl_tcp_mem,
2523         .sysctl_wmem            = sysctl_tcp_wmem,
2524         .sysctl_rmem            = sysctl_tcp_rmem,
2525         .max_header             = MAX_TCP_HEADER,
2526         .obj_size               = sizeof(struct tcp_sock),
2527         .slab_flags             = SLAB_DESTROY_BY_RCU,
2528         .twsk_prot              = &tcp_timewait_sock_ops,
2529         .rsk_prot               = &tcp_request_sock_ops,
2530         .h.hashinfo             = &tcp_hashinfo,
2531 #ifdef CONFIG_COMPAT
2532         .compat_setsockopt      = compat_tcp_setsockopt,
2533         .compat_getsockopt      = compat_tcp_getsockopt,
2534 #endif
2535 };
2536
2537
2538 static int __net_init tcp_sk_init(struct net *net)
2539 {
2540         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2541                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2542 }
2543
2544 static void __net_exit tcp_sk_exit(struct net *net)
2545 {
2546         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2547 }
2548
2549 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2550 {
2551         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2552 }
2553
2554 static struct pernet_operations __net_initdata tcp_sk_ops = {
2555        .init       = tcp_sk_init,
2556        .exit       = tcp_sk_exit,
2557        .exit_batch = tcp_sk_exit_batch,
2558 };
2559
2560 void __init tcp_v4_init(void)
2561 {
2562         inet_hashinfo_init(&tcp_hashinfo);
2563         if (register_pernet_subsys(&tcp_sk_ops))
2564                 panic("Failed to create the TCP control socket.\n");
2565 }
2566
2567 EXPORT_SYMBOL(ipv4_specific);
2568 EXPORT_SYMBOL(tcp_hashinfo);
2569 EXPORT_SYMBOL(tcp_prot);
2570 EXPORT_SYMBOL(tcp_v4_conn_request);
2571 EXPORT_SYMBOL(tcp_v4_connect);
2572 EXPORT_SYMBOL(tcp_v4_do_rcv);
2573 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2574 EXPORT_SYMBOL(tcp_v4_send_check);
2575 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2576
2577 #ifdef CONFIG_PROC_FS
2578 EXPORT_SYMBOL(tcp_proc_register);
2579 EXPORT_SYMBOL(tcp_proc_unregister);
2580 #endif
2581 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2582