tcp: advertise MSS requested by user
net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/types.h>
55 #include <linux/fcntl.h>
56 #include <linux/module.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 #include <linux/times.h>
62
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/netdma.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 #include <linux/crypto.h>
81 #include <linux/scatterlist.h>
82
83 int sysctl_tcp_tw_reuse __read_mostly;
84 int sysctl_tcp_low_latency __read_mostly;
85
86
87 #ifdef CONFIG_TCP_MD5SIG
88 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
89                                                    __be32 addr);
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
91                                __be32 daddr, __be32 saddr, struct tcphdr *th);
92 #else
93 static inline
94 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
95 {
96         return NULL;
97 }
98 #endif
99
100 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
101         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
102         .lhash_users = ATOMIC_INIT(0),
103         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
104 };
105
106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107 {
108         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109                                           ip_hdr(skb)->saddr,
110                                           tcp_hdr(skb)->dest,
111                                           tcp_hdr(skb)->source);
112 }
113
114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115 {
116         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117         struct tcp_sock *tp = tcp_sk(sk);
118
119         /* With PAWS, it is safe from the viewpoint
120            of data integrity. Even without PAWS it is safe provided sequence
 121            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
 122
 123            Actually, the idea is close to VJ's: only the timestamp cache is
 124            held not per host but per port pair, and the TW bucket is used as
 125            the state holder.
 126
 127            If the TW bucket has already been destroyed, we fall back to VJ's
 128            scheme and use the initial timestamp retrieved from the peer table.
129          */
130         if (tcptw->tw_ts_recent_stamp &&
131             (twp == NULL || (sysctl_tcp_tw_reuse &&
132                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134                 if (tp->write_seq == 0)
135                         tp->write_seq = 1;
136                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
137                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138                 sock_hold(sktw);
139                 return 1;
140         }
141
142         return 0;
143 }
144
145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146
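/*
 * Usage sketch (illustrative, not part of the function above): the
 * sysctl_tcp_tw_reuse knob it consults corresponds to the
 * net.ipv4.tcp_tw_reuse sysctl, so an administrator wanting outgoing
 * connections to reuse TIME-WAIT ports would typically run:
 *
 *      sysctl -w net.ipv4.tcp_tw_reuse=1
 */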
147 /* This will initiate an outgoing connection. */
148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149 {
150         struct inet_sock *inet = inet_sk(sk);
151         struct tcp_sock *tp = tcp_sk(sk);
152         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153         struct rtable *rt;
154         __be32 daddr, nexthop;
155         int tmp;
156         int err;
157
158         if (addr_len < sizeof(struct sockaddr_in))
159                 return -EINVAL;
160
161         if (usin->sin_family != AF_INET)
162                 return -EAFNOSUPPORT;
163
164         nexthop = daddr = usin->sin_addr.s_addr;
165         if (inet->opt && inet->opt->srr) {
166                 if (!daddr)
167                         return -EINVAL;
168                 nexthop = inet->opt->faddr;
169         }
170
171         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
172                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173                                IPPROTO_TCP,
174                                inet->sport, usin->sin_port, sk, 1);
175         if (tmp < 0) {
176                 if (tmp == -ENETUNREACH)
177                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return tmp;
179         }
180
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185
186         if (!inet->opt || !inet->opt->srr)
187                 daddr = rt->rt_dst;
188
189         if (!inet->saddr)
190                 inet->saddr = rt->rt_src;
191         inet->rcv_saddr = inet->saddr;
192
193         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 tp->write_seq              = 0;
198         }
199
200         if (tcp_death_row.sysctl_tw_recycle &&
201             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
202                 struct inet_peer *peer = rt_get_peer(rt);
203                 /*
 204                  * VJ's idea. We save the last timestamp seen from
 205                  * the destination in the peer table when entering
 206                  * TIME-WAIT state, and initialize rx_opt.ts_recent from it
 207                  * when trying a new connection.
208                  */
209                 if (peer != NULL &&
210                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
211                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212                         tp->rx_opt.ts_recent = peer->tcp_ts;
213                 }
214         }
215
216         inet->dport = usin->sin_port;
217         inet->daddr = daddr;
218
219         inet_csk(sk)->icsk_ext_hdr_len = 0;
220         if (inet->opt)
221                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
222
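        /* Assume the RFC 879 default MSS of 536 until the peer's SYN-ACK
         * advertises an MSS option of its own (parsed later into mss_clamp).
         */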
223         tp->rx_opt.mss_clamp = 536;
224
225         /* Socket identity is still unknown (sport may be zero).
 226          * However, we set the state to SYN-SENT and, without releasing the
 227          * socket lock, select a source port, enter ourselves into the hash
 228          * tables and complete initialization after this.
229          */
230         tcp_set_state(sk, TCP_SYN_SENT);
231         err = inet_hash_connect(&tcp_death_row, sk);
232         if (err)
233                 goto failure;
234
235         err = ip_route_newports(&rt, IPPROTO_TCP,
236                                 inet->sport, inet->dport, sk);
237         if (err)
238                 goto failure;
239
240         /* OK, now commit destination to socket.  */
241         sk->sk_gso_type = SKB_GSO_TCPV4;
242         sk_setup_caps(sk, &rt->u.dst);
243
244         if (!tp->write_seq)
245                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
246                                                            inet->daddr,
247                                                            inet->sport,
248                                                            usin->sin_port);
249
250         inet->id = tp->write_seq ^ jiffies;
251
252         err = tcp_connect(sk);
253         rt = NULL;
254         if (err)
255                 goto failure;
256
257         return 0;
258
259 failure:
260         /*
261          * This unhashes the socket and releases the local port,
262          * if necessary.
263          */
264         tcp_set_state(sk, TCP_CLOSE);
265         ip_rt_put(rt);
266         sk->sk_route_caps = 0;
267         inet->dport = 0;
268         return err;
269 }
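/*
 * Usage sketch (userspace, illustrative): tcp_v4_connect() is reached through
 * the ordinary connect() system call on a TCP socket, e.g.:
 *
 *      int fd = socket(AF_INET, SOCK_STREAM, 0);
 *      struct sockaddr_in dst = {
 *              .sin_family = AF_INET,
 *              .sin_port   = htons(80),
 *      };
 *      inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *      connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */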
270
271 /*
272  * This routine does path mtu discovery as defined in RFC1191.
273  */
274 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275 {
276         struct dst_entry *dst;
277         struct inet_sock *inet = inet_sk(sk);
278
279         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 280          * sent out by Linux are always < 576 bytes, so they should go through
281          * unfragmented).
282          */
283         if (sk->sk_state == TCP_LISTEN)
284                 return;
285
 286         /* We don't check in the dst entry whether pmtu discovery is forbidden
 287          * on this route. We just assume that no packet-too-big packets
 288          * are sent back when pmtu discovery is not active.
289          * There is a small race when the user changes this flag in the
290          * route, but I think that's acceptable.
291          */
292         if ((dst = __sk_dst_check(sk, 0)) == NULL)
293                 return;
294
295         dst->ops->update_pmtu(dst, mtu);
296
 297         /* Something is about to go wrong... Remember the soft error
 298          * in case this connection is not able to recover.
299          */
300         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301                 sk->sk_err_soft = EMSGSIZE;
302
303         mtu = dst_mtu(dst);
304
305         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307                 tcp_sync_mss(sk, mtu);
308
309                 /* Resend the TCP packet because it's
310                  * clear that the old packet has been
311                  * dropped. This is the new "fast" path mtu
312                  * discovery.
313                  */
314                 tcp_simple_retransmit(sk);
315         } /* else let the usual retransmit timer handle it */
316 }
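/*
 * Usage sketch (userspace, illustrative): the inet->pmtudisc policy tested
 * above is controlled per socket with the IP_MTU_DISCOVER option, e.g.:
 *
 *      int val = IP_PMTUDISC_DO;       (always set DF, rely on PMTU discovery)
 *      setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */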
317
318 /*
319  * This routine is called by the ICMP module when it gets some
320  * sort of error condition.  If err < 0 then the socket should
321  * be closed and the error returned to the user.  If err > 0
 322  * it's just the icmp type << 8 | icmp code.  After adjustment,
 323  * header points to the first 8 bytes of the tcp header.  We need
 324  * to find the appropriate port.
 325  *
 326  * The locking strategy used here is very "optimistic". When
 327  * someone else accesses the socket, the ICMP is just dropped,
 328  * and for some paths there is no check at all.
 329  * A more general error queue for deferred error handling
 330  * would probably be better.
331  *
332  */
333
334 void tcp_v4_err(struct sk_buff *skb, u32 info)
335 {
336         struct iphdr *iph = (struct iphdr *)skb->data;
337         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
338         struct tcp_sock *tp;
339         struct inet_sock *inet;
340         const int type = icmp_hdr(skb)->type;
341         const int code = icmp_hdr(skb)->code;
342         struct sock *sk;
343         __u32 seq;
344         int err;
345         struct net *net = dev_net(skb->dev);
346
347         if (skb->len < (iph->ihl << 2) + 8) {
348                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349                 return;
350         }
351
352         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353                         iph->saddr, th->source, inet_iif(skb));
354         if (!sk) {
355                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356                 return;
357         }
358         if (sk->sk_state == TCP_TIME_WAIT) {
359                 inet_twsk_put(inet_twsk(sk));
360                 return;
361         }
362
363         bh_lock_sock(sk);
364         /* If too many ICMPs get dropped on busy
365          * servers this needs to be solved differently.
366          */
367         if (sock_owned_by_user(sk))
368                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
370         if (sk->sk_state == TCP_CLOSE)
371                 goto out;
372
373         tp = tcp_sk(sk);
374         seq = ntohl(th->seq);
375         if (sk->sk_state != TCP_LISTEN &&
376             !between(seq, tp->snd_una, tp->snd_nxt)) {
377                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
378                 goto out;
379         }
380
381         switch (type) {
382         case ICMP_SOURCE_QUENCH:
383                 /* Just silently ignore these. */
384                 goto out;
385         case ICMP_PARAMETERPROB:
386                 err = EPROTO;
387                 break;
388         case ICMP_DEST_UNREACH:
389                 if (code > NR_ICMP_UNREACH)
390                         goto out;
391
392                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
393                         if (!sock_owned_by_user(sk))
394                                 do_pmtu_discovery(sk, iph, info);
395                         goto out;
396                 }
397
398                 err = icmp_err_convert[code].errno;
399                 break;
400         case ICMP_TIME_EXCEEDED:
401                 err = EHOSTUNREACH;
402                 break;
403         default:
404                 goto out;
405         }
406
407         switch (sk->sk_state) {
408                 struct request_sock *req, **prev;
409         case TCP_LISTEN:
410                 if (sock_owned_by_user(sk))
411                         goto out;
412
413                 req = inet_csk_search_req(sk, &prev, th->dest,
414                                           iph->daddr, iph->saddr);
415                 if (!req)
416                         goto out;
417
418                 /* ICMPs are not backlogged, hence we cannot get
419                    an established socket here.
420                  */
421                 WARN_ON(req->sk);
422
423                 if (seq != tcp_rsk(req)->snt_isn) {
424                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
425                         goto out;
426                 }
427
428                 /*
429                  * Still in SYN_RECV, just remove it silently.
430                  * There is no good way to pass the error to the newly
431                  * created socket, and POSIX does not want network
432                  * errors returned from accept().
433                  */
434                 inet_csk_reqsk_queue_drop(sk, req, prev);
435                 goto out;
436
437         case TCP_SYN_SENT:
 438         case TCP_SYN_RECV:  /* Cannot happen normally.
 439                                It can, e.g., if SYNs crossed.
440                              */
441                 if (!sock_owned_by_user(sk)) {
442                         sk->sk_err = err;
443
444                         sk->sk_error_report(sk);
445
446                         tcp_done(sk);
447                 } else {
448                         sk->sk_err_soft = err;
449                 }
450                 goto out;
451         }
452
453         /* If we've already connected we will keep trying
454          * until we time out, or the user gives up.
455          *
 456          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
 457          * considered hard errors (well, FRAG_FAILED too,
 458          * but it is obsoleted by pmtu discovery).
 459          *
 460          * Note that in the modern internet, where routing is unreliable
 461          * and broken firewalls sit in every dark corner sending random
 462          * errors ordered by their masters, even these two messages have lost
 463          * their original sense (even Linux sends invalid PORT_UNREACHs).
464          *
465          * Now we are in compliance with RFCs.
466          *                                                      --ANK (980905)
467          */
468
469         inet = inet_sk(sk);
470         if (!sock_owned_by_user(sk) && inet->recverr) {
471                 sk->sk_err = err;
472                 sk->sk_error_report(sk);
473         } else  { /* Only an error on timeout */
474                 sk->sk_err_soft = err;
475         }
476
477 out:
478         bh_unlock_sock(sk);
479         sock_put(sk);
480 }
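/*
 * Usage sketch (userspace, illustrative): the inet->recverr test above means
 * hard errors are reported asynchronously only when the application opted in
 * with IP_RECVERR, e.g.:
 *
 *      int on = 1;
 *      setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
 *
 * Otherwise only a soft error is recorded and surfaced on timeout.
 */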
481
482 /* This routine computes an IPv4 TCP checksum. */
483 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
484 {
485         struct inet_sock *inet = inet_sk(sk);
486         struct tcphdr *th = tcp_hdr(skb);
487
488         if (skb->ip_summed == CHECKSUM_PARTIAL) {
489                 th->check = ~tcp_v4_check(len, inet->saddr,
490                                           inet->daddr, 0);
491                 skb->csum_start = skb_transport_header(skb) - skb->head;
492                 skb->csum_offset = offsetof(struct tcphdr, check);
493         } else {
494                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
495                                          csum_partial((char *)th,
496                                                       th->doff << 2,
497                                                       skb->csum));
498         }
499 }
500
501 int tcp_v4_gso_send_check(struct sk_buff *skb)
502 {
503         const struct iphdr *iph;
504         struct tcphdr *th;
505
506         if (!pskb_may_pull(skb, sizeof(*th)))
507                 return -EINVAL;
508
509         iph = ip_hdr(skb);
510         th = tcp_hdr(skb);
511
512         th->check = 0;
513         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
514         skb->csum_start = skb_transport_header(skb) - skb->head;
515         skb->csum_offset = offsetof(struct tcphdr, check);
516         skb->ip_summed = CHECKSUM_PARTIAL;
517         return 0;
518 }
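/*
 * Note: with CHECKSUM_PARTIAL the stack only seeds th->check with the
 * pseudo-header sum; the device (or a software fallback) later checksums the
 * data from csum_start onward and folds the result into the 16-bit field
 * located csum_offset bytes into the transport header.
 */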
519
520 /*
521  *      This routine will send an RST to the other tcp.
522  *
 523  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
 524  *                    for the reset?
 525  *      Answer: if a packet caused the RST, it is not for a socket
 526  *              existing in our system; if it is matched to a socket,
 527  *              it is just a duplicate segment or a bug in the other side's TCP.
 528  *              So we build the reply based only on the parameters
 529  *              that arrived with the segment.
530  *      Exception: precedence violation. We do not implement it in any case.
531  */
532
533 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
534 {
535         struct tcphdr *th = tcp_hdr(skb);
536         struct {
537                 struct tcphdr th;
538 #ifdef CONFIG_TCP_MD5SIG
539                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
540 #endif
541         } rep;
542         struct ip_reply_arg arg;
543 #ifdef CONFIG_TCP_MD5SIG
544         struct tcp_md5sig_key *key;
545 #endif
546         struct net *net;
547
548         /* Never send a reset in response to a reset. */
549         if (th->rst)
550                 return;
551
552         if (skb->rtable->rt_type != RTN_LOCAL)
553                 return;
554
555         /* Swap the send and the receive. */
556         memset(&rep, 0, sizeof(rep));
557         rep.th.dest   = th->source;
558         rep.th.source = th->dest;
559         rep.th.doff   = sizeof(struct tcphdr) / 4;
560         rep.th.rst    = 1;
561
562         if (th->ack) {
563                 rep.th.seq = th->ack_seq;
564         } else {
565                 rep.th.ack = 1;
566                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
567                                        skb->len - (th->doff << 2));
568         }
569
570         memset(&arg, 0, sizeof(arg));
571         arg.iov[0].iov_base = (unsigned char *)&rep;
572         arg.iov[0].iov_len  = sizeof(rep.th);
573
574 #ifdef CONFIG_TCP_MD5SIG
575         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
576         if (key) {
577                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
578                                    (TCPOPT_NOP << 16) |
579                                    (TCPOPT_MD5SIG << 8) |
580                                    TCPOLEN_MD5SIG);
581                 /* Update length and the length the header thinks exists */
582                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
583                 rep.th.doff = arg.iov[0].iov_len / 4;
584
585                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
586                                      key, ip_hdr(skb)->daddr,
587                                      ip_hdr(skb)->saddr, &rep.th);
588         }
589 #endif
590         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
591                                       ip_hdr(skb)->saddr, /* XXX */
592                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
593         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
594
595         net = dev_net(skb->dst->dev);
596         ip_send_reply(net->ipv4.tcp_sock, skb,
597                       &arg, arg.iov[0].iov_len);
598
599         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
600         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
601 }
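/*
 * Worked example (illustrative): for a stray SYN with no data and sequence
 * number S, th->syn is 1, th->fin is 0 and skb->len equals the header length
 * (th->doff << 2), so the non-ACK branch above sends the RST with
 * ack_seq = S + 1, i.e. acknowledging exactly the SYN.
 */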
602
 603 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 604    outside socket context, is certainly ugly. What can I do?
605  */
606
607 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
608                             u32 win, u32 ts, int oif,
609                             struct tcp_md5sig_key *key)
610 {
611         struct tcphdr *th = tcp_hdr(skb);
612         struct {
613                 struct tcphdr th;
614                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
615 #ifdef CONFIG_TCP_MD5SIG
616                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
617 #endif
618                         ];
619         } rep;
620         struct ip_reply_arg arg;
621         struct net *net = dev_net(skb->dev);
622
623         memset(&rep.th, 0, sizeof(struct tcphdr));
624         memset(&arg, 0, sizeof(arg));
625
626         arg.iov[0].iov_base = (unsigned char *)&rep;
627         arg.iov[0].iov_len  = sizeof(rep.th);
628         if (ts) {
629                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
630                                    (TCPOPT_TIMESTAMP << 8) |
631                                    TCPOLEN_TIMESTAMP);
632                 rep.opt[1] = htonl(tcp_time_stamp);
633                 rep.opt[2] = htonl(ts);
634                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
635         }
636
637         /* Swap the send and the receive. */
638         rep.th.dest    = th->source;
639         rep.th.source  = th->dest;
640         rep.th.doff    = arg.iov[0].iov_len / 4;
641         rep.th.seq     = htonl(seq);
642         rep.th.ack_seq = htonl(ack);
643         rep.th.ack     = 1;
644         rep.th.window  = htons(win);
645
646 #ifdef CONFIG_TCP_MD5SIG
647         if (key) {
648                 int offset = (ts) ? 3 : 0;
649
650                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
651                                           (TCPOPT_NOP << 16) |
652                                           (TCPOPT_MD5SIG << 8) |
653                                           TCPOLEN_MD5SIG);
654                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
655                 rep.th.doff = arg.iov[0].iov_len/4;
656
657                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
658                                     key, ip_hdr(skb)->saddr,
659                                     ip_hdr(skb)->daddr, &rep.th);
660         }
661 #endif
662         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
663                                       ip_hdr(skb)->saddr, /* XXX */
664                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
665         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
666         if (oif)
667                 arg.bound_dev_if = oif;
668
669         ip_send_reply(net->ipv4.tcp_sock, skb,
670                       &arg, arg.iov[0].iov_len);
671
672         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
673 }
674
675 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
676 {
677         struct inet_timewait_sock *tw = inet_twsk(sk);
678         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
679
680         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
681                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
682                         tcptw->tw_ts_recent,
683                         tw->tw_bound_dev_if,
684                         tcp_twsk_md5_key(tcptw)
685                         );
686
687         inet_twsk_put(tw);
688 }
689
690 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
691                                   struct request_sock *req)
692 {
693         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
694                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
695                         req->ts_recent,
696                         0,
697                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr));
698 }
699
700 /*
701  *      Send a SYN-ACK after having received a SYN.
702  *      This still operates on a request_sock only, not on a big
703  *      socket.
704  */
705 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
706                                 struct dst_entry *dst)
707 {
708         const struct inet_request_sock *ireq = inet_rsk(req);
709         int err = -1;
710         struct sk_buff * skb;
711
712         /* First, grab a route. */
713         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
714                 return -1;
715
716         skb = tcp_make_synack(sk, dst, req);
717
718         if (skb) {
719                 struct tcphdr *th = tcp_hdr(skb);
720
721                 th->check = tcp_v4_check(skb->len,
722                                          ireq->loc_addr,
723                                          ireq->rmt_addr,
724                                          csum_partial((char *)th, skb->len,
725                                                       skb->csum));
726
727                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
728                                             ireq->rmt_addr,
729                                             ireq->opt);
730                 err = net_xmit_eval(err);
731         }
732
733         dst_release(dst);
734         return err;
735 }
736
737 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
738 {
739         return __tcp_v4_send_synack(sk, req, NULL);
740 }
741
742 /*
743  *      IPv4 request_sock destructor.
744  */
745 static void tcp_v4_reqsk_destructor(struct request_sock *req)
746 {
747         kfree(inet_rsk(req)->opt);
748 }
749
750 #ifdef CONFIG_SYN_COOKIES
751 static void syn_flood_warning(struct sk_buff *skb)
752 {
753         static unsigned long warntime;
754
755         if (time_after(jiffies, (warntime + HZ * 60))) {
756                 warntime = jiffies;
757                 printk(KERN_INFO
758                        "possible SYN flooding on port %d. Sending cookies.\n",
759                        ntohs(tcp_hdr(skb)->dest));
760         }
761 }
762 #endif
763
764 /*
765  * Save and compile IPv4 options into the request_sock if needed.
766  */
767 static struct ip_options *tcp_v4_save_options(struct sock *sk,
768                                               struct sk_buff *skb)
769 {
770         struct ip_options *opt = &(IPCB(skb)->opt);
771         struct ip_options *dopt = NULL;
772
773         if (opt && opt->optlen) {
774                 int opt_size = optlength(opt);
775                 dopt = kmalloc(opt_size, GFP_ATOMIC);
776                 if (dopt) {
777                         if (ip_options_echo(dopt, skb)) {
778                                 kfree(dopt);
779                                 dopt = NULL;
780                         }
781                 }
782         }
783         return dopt;
784 }
785
786 #ifdef CONFIG_TCP_MD5SIG
787 /*
788  * RFC2385 MD5 checksumming requires a mapping of
789  * IP address->MD5 Key.
790  * We need to maintain these in the sk structure.
791  */
792
793 /* Find the Key structure for an address.  */
794 static struct tcp_md5sig_key *
795                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
796 {
797         struct tcp_sock *tp = tcp_sk(sk);
798         int i;
799
800         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
801                 return NULL;
802         for (i = 0; i < tp->md5sig_info->entries4; i++) {
803                 if (tp->md5sig_info->keys4[i].addr == addr)
804                         return &tp->md5sig_info->keys4[i].base;
805         }
806         return NULL;
807 }
808
809 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
810                                          struct sock *addr_sk)
811 {
812         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
813 }
814
815 EXPORT_SYMBOL(tcp_v4_md5_lookup);
816
817 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
818                                                       struct request_sock *req)
819 {
820         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
821 }
822
823 /* This can be called on a newly created socket, from other files */
824 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
825                       u8 *newkey, u8 newkeylen)
826 {
827         /* Add Key to the list */
828         struct tcp_md5sig_key *key;
829         struct tcp_sock *tp = tcp_sk(sk);
830         struct tcp4_md5sig_key *keys;
831
832         key = tcp_v4_md5_do_lookup(sk, addr);
833         if (key) {
834                 /* Pre-existing entry - just update that one. */
835                 kfree(key->key);
836                 key->key = newkey;
837                 key->keylen = newkeylen;
838         } else {
839                 struct tcp_md5sig_info *md5sig;
840
841                 if (!tp->md5sig_info) {
842                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
843                                                   GFP_ATOMIC);
844                         if (!tp->md5sig_info) {
845                                 kfree(newkey);
846                                 return -ENOMEM;
847                         }
848                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
849                 }
850                 if (tcp_alloc_md5sig_pool() == NULL) {
851                         kfree(newkey);
852                         return -ENOMEM;
853                 }
854                 md5sig = tp->md5sig_info;
855
856                 if (md5sig->alloced4 == md5sig->entries4) {
857                         keys = kmalloc((sizeof(*keys) *
858                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
859                         if (!keys) {
860                                 kfree(newkey);
861                                 tcp_free_md5sig_pool();
862                                 return -ENOMEM;
863                         }
864
865                         if (md5sig->entries4)
866                                 memcpy(keys, md5sig->keys4,
867                                        sizeof(*keys) * md5sig->entries4);
868
869                         /* Free old key list, and reference new one */
870                         kfree(md5sig->keys4);
871                         md5sig->keys4 = keys;
872                         md5sig->alloced4++;
873                 }
874                 md5sig->entries4++;
875                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
876                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
877                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
878         }
879         return 0;
880 }
881
882 EXPORT_SYMBOL(tcp_v4_md5_do_add);
883
884 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
885                                u8 *newkey, u8 newkeylen)
886 {
887         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
888                                  newkey, newkeylen);
889 }
890
891 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
892 {
893         struct tcp_sock *tp = tcp_sk(sk);
894         int i;
895
896         for (i = 0; i < tp->md5sig_info->entries4; i++) {
897                 if (tp->md5sig_info->keys4[i].addr == addr) {
898                         /* Free the key */
899                         kfree(tp->md5sig_info->keys4[i].base.key);
900                         tp->md5sig_info->entries4--;
901
902                         if (tp->md5sig_info->entries4 == 0) {
903                                 kfree(tp->md5sig_info->keys4);
904                                 tp->md5sig_info->keys4 = NULL;
905                                 tp->md5sig_info->alloced4 = 0;
906                         } else if (tp->md5sig_info->entries4 != i) {
907                                 /* Need to do some manipulation */
908                                 memmove(&tp->md5sig_info->keys4[i],
909                                         &tp->md5sig_info->keys4[i+1],
910                                         (tp->md5sig_info->entries4 - i) *
911                                          sizeof(struct tcp4_md5sig_key));
912                         }
913                         tcp_free_md5sig_pool();
914                         return 0;
915                 }
916         }
917         return -ENOENT;
918 }
919
920 EXPORT_SYMBOL(tcp_v4_md5_do_del);
921
922 static void tcp_v4_clear_md5_list(struct sock *sk)
923 {
924         struct tcp_sock *tp = tcp_sk(sk);
925
 926         /* Free each key, then the array of keys,
 927          * the crypto element, and then decrement our
 928          * hold on the last-resort crypto.
929          */
930         if (tp->md5sig_info->entries4) {
931                 int i;
932                 for (i = 0; i < tp->md5sig_info->entries4; i++)
933                         kfree(tp->md5sig_info->keys4[i].base.key);
934                 tp->md5sig_info->entries4 = 0;
935                 tcp_free_md5sig_pool();
936         }
937         if (tp->md5sig_info->keys4) {
938                 kfree(tp->md5sig_info->keys4);
939                 tp->md5sig_info->keys4 = NULL;
940                 tp->md5sig_info->alloced4  = 0;
941         }
942 }
943
944 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
945                                  int optlen)
946 {
947         struct tcp_md5sig cmd;
948         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
949         u8 *newkey;
950
951         if (optlen < sizeof(cmd))
952                 return -EINVAL;
953
954         if (copy_from_user(&cmd, optval, sizeof(cmd)))
955                 return -EFAULT;
956
957         if (sin->sin_family != AF_INET)
958                 return -EINVAL;
959
960         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
961                 if (!tcp_sk(sk)->md5sig_info)
962                         return -ENOENT;
963                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
964         }
965
966         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
967                 return -EINVAL;
968
969         if (!tcp_sk(sk)->md5sig_info) {
970                 struct tcp_sock *tp = tcp_sk(sk);
971                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
972
973                 if (!p)
974                         return -EINVAL;
975
976                 tp->md5sig_info = p;
977                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
978         }
979
980         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
981         if (!newkey)
982                 return -ENOMEM;
983         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
984                                  newkey, cmd.tcpm_keylen);
985 }
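/*
 * Usage sketch (userspace, illustrative): the keys parsed above arrive via
 * the TCP_MD5SIG socket option, e.g.:
 *
 *      struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *      struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      peer->sin_family = AF_INET;
 *      inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */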
986
987 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
988                                         __be32 daddr, __be32 saddr, int nbytes)
989 {
990         struct tcp4_pseudohdr *bp;
991         struct scatterlist sg;
992
993         bp = &hp->md5_blk.ip4;
994
995         /*
996          * 1. the TCP pseudo-header (in the order: source IP address,
997          * destination IP address, zero-padded protocol number, and
998          * segment length)
999          */
1000         bp->saddr = saddr;
1001         bp->daddr = daddr;
1002         bp->pad = 0;
1003         bp->protocol = IPPROTO_TCP;
1004         bp->len = cpu_to_be16(nbytes);
1005
1006         sg_init_one(&sg, bp, sizeof(*bp));
1007         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1008 }
1009
1010 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1011                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1012 {
1013         struct tcp_md5sig_pool *hp;
1014         struct hash_desc *desc;
1015
1016         hp = tcp_get_md5sig_pool();
1017         if (!hp)
1018                 goto clear_hash_noput;
1019         desc = &hp->md5_desc;
1020
1021         if (crypto_hash_init(desc))
1022                 goto clear_hash;
1023         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1024                 goto clear_hash;
1025         if (tcp_md5_hash_header(hp, th))
1026                 goto clear_hash;
1027         if (tcp_md5_hash_key(hp, key))
1028                 goto clear_hash;
1029         if (crypto_hash_final(desc, md5_hash))
1030                 goto clear_hash;
1031
1032         tcp_put_md5sig_pool();
1033         return 0;
1034
1035 clear_hash:
1036         tcp_put_md5sig_pool();
1037 clear_hash_noput:
1038         memset(md5_hash, 0, 16);
1039         return 1;
1040 }
1041
1042 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1043                         struct sock *sk, struct request_sock *req,
1044                         struct sk_buff *skb)
1045 {
1046         struct tcp_md5sig_pool *hp;
1047         struct hash_desc *desc;
1048         struct tcphdr *th = tcp_hdr(skb);
1049         __be32 saddr, daddr;
1050
1051         if (sk) {
1052                 saddr = inet_sk(sk)->saddr;
1053                 daddr = inet_sk(sk)->daddr;
1054         } else if (req) {
1055                 saddr = inet_rsk(req)->loc_addr;
1056                 daddr = inet_rsk(req)->rmt_addr;
1057         } else {
1058                 const struct iphdr *iph = ip_hdr(skb);
1059                 saddr = iph->saddr;
1060                 daddr = iph->daddr;
1061         }
1062
1063         hp = tcp_get_md5sig_pool();
1064         if (!hp)
1065                 goto clear_hash_noput;
1066         desc = &hp->md5_desc;
1067
1068         if (crypto_hash_init(desc))
1069                 goto clear_hash;
1070
1071         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1072                 goto clear_hash;
1073         if (tcp_md5_hash_header(hp, th))
1074                 goto clear_hash;
1075         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1076                 goto clear_hash;
1077         if (tcp_md5_hash_key(hp, key))
1078                 goto clear_hash;
1079         if (crypto_hash_final(desc, md5_hash))
1080                 goto clear_hash;
1081
1082         tcp_put_md5sig_pool();
1083         return 0;
1084
1085 clear_hash:
1086         tcp_put_md5sig_pool();
1087 clear_hash_noput:
1088         memset(md5_hash, 0, 16);
1089         return 1;
1090 }
1091
1092 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1093
1094 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1095 {
1096         /*
1097          * This gets called for each TCP segment that arrives
1098          * so we want to be efficient.
1099          * We have 3 drop cases:
1100          * o No MD5 hash and one expected.
1101          * o MD5 hash and we're not expecting one.
1102          * o MD5 hash and its wrong.
 1103          * o MD5 hash and it's wrong.
1104         __u8 *hash_location = NULL;
1105         struct tcp_md5sig_key *hash_expected;
1106         const struct iphdr *iph = ip_hdr(skb);
1107         struct tcphdr *th = tcp_hdr(skb);
1108         int genhash;
1109         unsigned char newhash[16];
1110
1111         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1112         hash_location = tcp_parse_md5sig_option(th);
1113
1114         /* We've parsed the options - do we have a hash? */
1115         if (!hash_expected && !hash_location)
1116                 return 0;
1117
1118         if (hash_expected && !hash_location) {
1119                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1120                 return 1;
1121         }
1122
1123         if (!hash_expected && hash_location) {
1124                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1125                 return 1;
1126         }
1127
 1128         /* Okay, we have both hash_expected and hash_location -
 1129          * so we need to calculate the hash.
1130          */
1131         genhash = tcp_v4_md5_hash_skb(newhash,
1132                                       hash_expected,
1133                                       NULL, NULL, skb);
1134
1135         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1136                 if (net_ratelimit()) {
1137                         printk(KERN_INFO "MD5 Hash failed for "
1138                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1139                                NIPQUAD(iph->saddr), ntohs(th->source),
1140                                NIPQUAD(iph->daddr), ntohs(th->dest),
1141                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1142                 }
1143                 return 1;
1144         }
1145         return 0;
1146 }
1147
1148 #endif
1149
1150 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1151         .family         =       PF_INET,
1152         .obj_size       =       sizeof(struct tcp_request_sock),
1153         .rtx_syn_ack    =       tcp_v4_send_synack,
1154         .send_ack       =       tcp_v4_reqsk_send_ack,
1155         .destructor     =       tcp_v4_reqsk_destructor,
1156         .send_reset     =       tcp_v4_send_reset,
1157 };
1158
1159 #ifdef CONFIG_TCP_MD5SIG
1160 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1161         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1162 };
1163 #endif
1164
1165 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1166         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1167         .twsk_unique    = tcp_twsk_unique,
1168         .twsk_destructor= tcp_twsk_destructor,
1169 };
1170
1171 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1172 {
1173         struct inet_request_sock *ireq;
1174         struct tcp_options_received tmp_opt;
1175         struct request_sock *req;
1176         __be32 saddr = ip_hdr(skb)->saddr;
1177         __be32 daddr = ip_hdr(skb)->daddr;
1178         __u32 isn = TCP_SKB_CB(skb)->when;
1179         struct dst_entry *dst = NULL;
1180 #ifdef CONFIG_SYN_COOKIES
1181         int want_cookie = 0;
1182 #else
1183 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1184 #endif
1185
 1186         /* Never answer SYNs sent to broadcast or multicast */
1187         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1188                 goto drop;
1189
 1190         /* TW buckets are converted to open requests without
 1191          * limitations: they conserve resources and the peer is
 1192          * evidently a real one.
1193          */
1194         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1195 #ifdef CONFIG_SYN_COOKIES
1196                 if (sysctl_tcp_syncookies) {
1197                         want_cookie = 1;
1198                 } else
1199 #endif
1200                 goto drop;
1201         }
1202
 1203         /* Accept backlog is full. If we have already queued enough
 1204          * warm entries in the syn queue, drop this request. That is better
 1205          * than clogging the syn queue with openreqs whose timeouts increase
 1206          * exponentially.
1207          */
1208         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1209                 goto drop;
1210
1211         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1212         if (!req)
1213                 goto drop;
1214
1215 #ifdef CONFIG_TCP_MD5SIG
1216         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1217 #endif
1218
1219         tcp_clear_options(&tmp_opt);
1220         tmp_opt.mss_clamp = 536;
1221         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1222
1223         tcp_parse_options(skb, &tmp_opt, 0);
1224
1225         if (want_cookie && !tmp_opt.saw_tstamp)
1226                 tcp_clear_options(&tmp_opt);
1227
1228         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
 1229                 /* Some OSes (unknown ones, but I see them on a web server
 1230                  * with information interesting only to Windows users) do not
 1231                  * send their timestamp in the SYN. It is an easy case:
 1232                  * we simply do not advertise TS support.
1233                  */
1234                 tmp_opt.saw_tstamp = 0;
1235                 tmp_opt.tstamp_ok  = 0;
1236         }
1237         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1238
1239         tcp_openreq_init(req, &tmp_opt, skb);
1240
1241         if (security_inet_conn_request(sk, skb, req))
1242                 goto drop_and_free;
1243
1244         ireq = inet_rsk(req);
1245         ireq->loc_addr = daddr;
1246         ireq->rmt_addr = saddr;
1247         ireq->opt = tcp_v4_save_options(sk, skb);
1248         if (!want_cookie)
1249                 TCP_ECN_create_request(req, tcp_hdr(skb));
1250
1251         if (want_cookie) {
1252 #ifdef CONFIG_SYN_COOKIES
1253                 syn_flood_warning(skb);
1254                 req->cookie_ts = tmp_opt.tstamp_ok;
1255 #endif
1256                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1257         } else if (!isn) {
1258                 struct inet_peer *peer = NULL;
1259
 1260                 /* VJ's idea. We save the last timestamp seen
 1261                  * from the destination in the peer table when entering
 1262                  * TIME-WAIT state, and check against it before
 1263                  * accepting a new connection request.
 1264                  *
 1265                  * If "isn" is not zero, this request hit a live
 1266                  * timewait bucket, so all the necessary checks
 1267                  * are made in the function processing the timewait state.
1268                  */
1269                 if (tmp_opt.saw_tstamp &&
1270                     tcp_death_row.sysctl_tw_recycle &&
1271                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1272                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1273                     peer->v4daddr == saddr) {
1274                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1275                             (s32)(peer->tcp_ts - req->ts_recent) >
1276                                                         TCP_PAWS_WINDOW) {
1277                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1278                                 goto drop_and_release;
1279                         }
1280                 }
 1281                 /* Kill the following clause if you dislike this approach. */
1282                 else if (!sysctl_tcp_syncookies &&
1283                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1284                           (sysctl_max_syn_backlog >> 2)) &&
1285                          (!peer || !peer->tcp_ts_stamp) &&
1286                          (!dst || !dst_metric(dst, RTAX_RTT))) {
 1287                         /* Without syncookies, the last quarter of the
 1288                          * backlog is reserved for destinations proven
 1289                          * to be alive.
 1290                          * This means we continue to communicate with
 1291                          * destinations already remembered by the time
 1292                          * the synflood started.
1293                          */
1294                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1295                                        "request from " NIPQUAD_FMT "/%u\n",
1296                                        NIPQUAD(saddr),
1297                                        ntohs(tcp_hdr(skb)->source));
1298                         goto drop_and_release;
1299                 }
1300
1301                 isn = tcp_v4_init_sequence(skb);
1302         }
1303         tcp_rsk(req)->snt_isn = isn;
1304
1305         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1306                 goto drop_and_free;
1307
1308         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1309         return 0;
1310
1311 drop_and_release:
1312         dst_release(dst);
1313 drop_and_free:
1314         reqsk_free(req);
1315 drop:
1316         return 0;
1317 }
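/*
 * Note (illustrative): whether the want_cookie path above is ever taken is
 * governed by the net.ipv4.tcp_syncookies sysctl (sysctl_tcp_syncookies),
 * together with CONFIG_SYN_COOKIES at build time.
 */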
1318
1319
1320 /*
1321  * The three way handshake has completed - we got a valid synack -
1322  * now create the new socket.
1323  */
1324 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1325                                   struct request_sock *req,
1326                                   struct dst_entry *dst)
1327 {
1328         struct inet_request_sock *ireq;
1329         struct inet_sock *newinet;
1330         struct tcp_sock *newtp;
1331         struct sock *newsk;
1332 #ifdef CONFIG_TCP_MD5SIG
1333         struct tcp_md5sig_key *key;
1334 #endif
1335
1336         if (sk_acceptq_is_full(sk))
1337                 goto exit_overflow;
1338
1339         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1340                 goto exit;
1341
1342         newsk = tcp_create_openreq_child(sk, req, skb);
1343         if (!newsk)
1344                 goto exit;
1345
1346         newsk->sk_gso_type = SKB_GSO_TCPV4;
1347         sk_setup_caps(newsk, dst);
1348
1349         newtp                 = tcp_sk(newsk);
1350         newinet               = inet_sk(newsk);
1351         ireq                  = inet_rsk(req);
1352         newinet->daddr        = ireq->rmt_addr;
1353         newinet->rcv_saddr    = ireq->loc_addr;
1354         newinet->saddr        = ireq->loc_addr;
1355         newinet->opt          = ireq->opt;
1356         ireq->opt             = NULL;
1357         newinet->mc_index     = inet_iif(skb);
1358         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1359         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1360         if (newinet->opt)
1361                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1362         newinet->id = newtp->write_seq ^ jiffies;
1363
1364         tcp_mtup_init(newsk);
1365         tcp_sync_mss(newsk, dst_mtu(dst));
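        /* Advertise the MSS requested by the user (TCP_MAXSEG on the listening
         * socket), if any, when it is smaller than the route's advertised MSS.
         */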
1366         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1367         if (tcp_sk(sk)->rx_opt.user_mss &&
1368             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1369                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1370
1371         tcp_initialize_rcv_mss(newsk);
1372
1373 #ifdef CONFIG_TCP_MD5SIG
1374         /* Copy over the MD5 key from the original socket */
1375         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1376                 /*
1377                  * We're using one, so create a matching key
1378                  * on the newsk structure. If we fail to get
1379                  * memory, then we end up not copying the key
1380                  * across. Shucks.
1381                  */
1382                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1383                 if (newkey != NULL)
1384                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1385                                           newkey, key->keylen);
1386                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1387         }
1388 #endif
1389
1390         __inet_hash_nolisten(newsk);
1391         __inet_inherit_port(sk, newsk);
1392
1393         return newsk;
1394
1395 exit_overflow:
1396         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1397 exit:
1398         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1399         dst_release(dst);
1400         return NULL;
1401 }
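/*
 * Usage sketch (userspace, illustrative): the user_mss honoured above is set
 * on the listening socket before connections are accepted, e.g.:
 *
 *      int mss = 1400;
 *      setsockopt(listen_fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss));
 */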
1402
1403 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1404 {
1405         struct tcphdr *th = tcp_hdr(skb);
1406         const struct iphdr *iph = ip_hdr(skb);
1407         struct sock *nsk;
1408         struct request_sock **prev;
1409         /* Find possible connection requests. */
1410         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1411                                                        iph->saddr, iph->daddr);
1412         if (req)
1413                 return tcp_check_req(sk, skb, req, prev);
1414
1415         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1416                         th->source, iph->daddr, th->dest, inet_iif(skb));
1417
1418         if (nsk) {
1419                 if (nsk->sk_state != TCP_TIME_WAIT) {
1420                         bh_lock_sock(nsk);
1421                         return nsk;
1422                 }
1423                 inet_twsk_put(inet_twsk(nsk));
1424                 return NULL;
1425         }
1426
1427 #ifdef CONFIG_SYN_COOKIES
1428         if (!th->rst && !th->syn && th->ack)
1429                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1430 #endif
1431         return sk;
1432 }
1433
1434 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1435 {
1436         const struct iphdr *iph = ip_hdr(skb);
1437
1438         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1439                 if (!tcp_v4_check(skb->len, iph->saddr,
1440                                   iph->daddr, skb->csum)) {
1441                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1442                         return 0;
1443                 }
1444         }
1445
1446         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1447                                        skb->len, IPPROTO_TCP, 0);
1448
1449         if (skb->len <= 76) {
1450                 return __skb_checksum_complete(skb);
1451         }
1452         return 0;
1453 }
1454
1455
1456 /* The socket must have its spinlock held when we get
1457  * here.
1458  *
1459  * We have a potential double-lock case here, so even when
1460  * doing backlog processing we use the BH locking scheme.
1461  * This is because we cannot sleep with the original spinlock
1462  * held.
1463  */
1464 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1465 {
1466         struct sock *rsk;
1467 #ifdef CONFIG_TCP_MD5SIG
1468         /*
1469          * We really want to reject the packet as early as possible
1470          * if:
1471          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1472          *  o There is an MD5 option and we're not expecting one
1473          */
1474         if (tcp_v4_inbound_md5_hash(sk, skb))
1475                 goto discard;
1476 #endif
1477
1478         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1479                 TCP_CHECK_TIMER(sk);
1480                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1481                         rsk = sk;
1482                         goto reset;
1483                 }
1484                 TCP_CHECK_TIMER(sk);
1485                 return 0;
1486         }
1487
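             /* Outside the ESTABLISHED fast path, validate header length
              * and checksum before any state processing.
              */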
1488         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1489                 goto csum_err;
1490
1491         if (sk->sk_state == TCP_LISTEN) {
1492                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1493                 if (!nsk)
1494                         goto discard;
1495
1496                 if (nsk != sk) {
1497                         if (tcp_child_process(sk, nsk, skb)) {
1498                                 rsk = nsk;
1499                                 goto reset;
1500                         }
1501                         return 0;
1502                 }
1503         }
1504
1505         TCP_CHECK_TIMER(sk);
1506         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1507                 rsk = sk;
1508                 goto reset;
1509         }
1510         TCP_CHECK_TIMER(sk);
1511         return 0;
1512
1513 reset:
1514         tcp_v4_send_reset(rsk, skb);
1515 discard:
1516         kfree_skb(skb);
1517         /* Be careful here. If this function gets more complicated and
1518          * gcc suffers from register pressure on the x86, sk (in %ebx)
1519          * might be destroyed here. This current version compiles correctly,
1520          * but you have been warned.
1521          */
1522         return 0;
1523
1524 csum_err:
1525         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1526         goto discard;
1527 }
1528
1529 /*
1530  *      From tcp_input.c
1531  */
1532
1533 int tcp_v4_rcv(struct sk_buff *skb)
1534 {
1535         const struct iphdr *iph;
1536         struct tcphdr *th;
1537         struct sock *sk;
1538         int ret;
1539         struct net *net = dev_net(skb->dev);
1540
1541         if (skb->pkt_type != PACKET_HOST)
1542                 goto discard_it;
1543
1544         /* Count it even if it's bad */
1545         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1546
1547         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1548                 goto discard_it;
1549
1550         th = tcp_hdr(skb);
1551
1552         if (th->doff < sizeof(struct tcphdr) / 4)
1553                 goto bad_packet;
1554         if (!pskb_may_pull(skb, th->doff * 4))
1555                 goto discard_it;
1556
1557         /* An explanation is required here.
1558          * Packet length and doff are validated later by header prediction,
1559          * provided the case of th->doff == 0 has been eliminated above.
1560          * So we defer those checks. */
1561         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1562                 goto bad_packet;
1563
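             /* Fill the TCP control block from the headers.  SYN and FIN
              * each occupy one unit of sequence space, hence their
              * inclusion in end_seq.
              */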
1564         th = tcp_hdr(skb);
1565         iph = ip_hdr(skb);
1566         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1567         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1568                                     skb->len - th->doff * 4);
1569         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1570         TCP_SKB_CB(skb)->when    = 0;
1571         TCP_SKB_CB(skb)->flags   = iph->tos;
1572         TCP_SKB_CB(skb)->sacked  = 0;
1573
1574         sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
1575                         th->source, iph->daddr, th->dest, inet_iif(skb));
1576         if (!sk)
1577                 goto no_tcp_socket;
1578
1579 process:
1580         if (sk->sk_state == TCP_TIME_WAIT)
1581                 goto do_time_wait;
1582
1583         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1584                 goto discard_and_relse;
1585         nf_reset(skb);
1586
1587         if (sk_filter(sk, skb))
1588                 goto discard_and_relse;
1589
1590         skb->dev = NULL;
1591
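             /* Three delivery paths: process the segment now if no user
              * context owns the socket, let the prequeue hand it to a
              * waiting reader, or defer it to the backlog until the owner
              * releases the socket lock.
              */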
1592         bh_lock_sock_nested(sk);
1593         ret = 0;
1594         if (!sock_owned_by_user(sk)) {
1595 #ifdef CONFIG_NET_DMA
1596                 struct tcp_sock *tp = tcp_sk(sk);
1597                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1598                         tp->ucopy.dma_chan = get_softnet_dma();
1599                 if (tp->ucopy.dma_chan)
1600                         ret = tcp_v4_do_rcv(sk, skb);
1601                 else
1602 #endif
1603                 {
1604                         if (!tcp_prequeue(sk, skb))
1605                                 ret = tcp_v4_do_rcv(sk, skb);
1606                 }
1607         } else
1608                 sk_add_backlog(sk, skb);
1609         bh_unlock_sock(sk);
1610
1611         sock_put(sk);
1612
1613         return ret;
1614
1615 no_tcp_socket:
1616         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1617                 goto discard_it;
1618
1619         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1620 bad_packet:
1621                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1622         } else {
1623                 tcp_v4_send_reset(NULL, skb);
1624         }
1625
1626 discard_it:
1627         /* Discard frame. */
1628         kfree_skb(skb);
1629         return 0;
1630
1631 discard_and_relse:
1632         sock_put(sk);
1633         goto discard_it;
1634
1635 do_time_wait:
1636         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1637                 inet_twsk_put(inet_twsk(sk));
1638                 goto discard_it;
1639         }
1640
1641         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1642                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1643                 inet_twsk_put(inet_twsk(sk));
1644                 goto discard_it;
1645         }
1646         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
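             /* A SYN arriving for a TIME_WAIT connection may be acceptable;
              * if so, hand it over to a current listener on the same port.
              */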
1647         case TCP_TW_SYN: {
1648                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1649                                                         &tcp_hashinfo,
1650                                                         iph->daddr, th->dest,
1651                                                         inet_iif(skb));
1652                 if (sk2) {
1653                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1654                         inet_twsk_put(inet_twsk(sk));
1655                         sk = sk2;
1656                         goto process;
1657                 }
1658                 /* Fall through to ACK */
1659         }
1660         case TCP_TW_ACK:
1661                 tcp_v4_timewait_ack(sk, skb);
1662                 break;
1663         case TCP_TW_RST:
1664                 goto no_tcp_socket;
1665         case TCP_TW_SUCCESS:;
1666         }
1667         goto discard_it;
1668 }
1669
1670 /* VJ's idea. Save last timestamp seen from this destination
1671  * and hold it at least for normal timewait interval to use for duplicate
1672  * segment detection in subsequent connections, before they enter synchronized
1673  * state.
1674  */
1675
1676 int tcp_v4_remember_stamp(struct sock *sk)
1677 {
1678         struct inet_sock *inet = inet_sk(sk);
1679         struct tcp_sock *tp = tcp_sk(sk);
1680         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1681         struct inet_peer *peer = NULL;
1682         int release_it = 0;
1683
1684         if (!rt || rt->rt_dst != inet->daddr) {
1685                 peer = inet_getpeer(inet->daddr, 1);
1686                 release_it = 1;
1687         } else {
1688                 if (!rt->peer)
1689                         rt_bind_peer(rt, 1);
1690                 peer = rt->peer;
1691         }
1692
1693         if (peer) {
1694                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1695                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1696                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1697                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1698                         peer->tcp_ts = tp->rx_opt.ts_recent;
1699                 }
1700                 if (release_it)
1701                         inet_putpeer(peer);
1702                 return 1;
1703         }
1704
1705         return 0;
1706 }
1707
1708 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1709 {
1710         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1711
1712         if (peer) {
1713                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1714
1715                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1716                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1717                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1718                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1719                         peer->tcp_ts       = tcptw->tw_ts_recent;
1720                 }
1721                 inet_putpeer(peer);
1722                 return 1;
1723         }
1724
1725         return 0;
1726 }
1727
1728 struct inet_connection_sock_af_ops ipv4_specific = {
1729         .queue_xmit        = ip_queue_xmit,
1730         .send_check        = tcp_v4_send_check,
1731         .rebuild_header    = inet_sk_rebuild_header,
1732         .conn_request      = tcp_v4_conn_request,
1733         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1734         .remember_stamp    = tcp_v4_remember_stamp,
1735         .net_header_len    = sizeof(struct iphdr),
1736         .setsockopt        = ip_setsockopt,
1737         .getsockopt        = ip_getsockopt,
1738         .addr2sockaddr     = inet_csk_addr2sockaddr,
1739         .sockaddr_len      = sizeof(struct sockaddr_in),
1740         .bind_conflict     = inet_csk_bind_conflict,
1741 #ifdef CONFIG_COMPAT
1742         .compat_setsockopt = compat_ip_setsockopt,
1743         .compat_getsockopt = compat_ip_getsockopt,
1744 #endif
1745 };
1746
1747 #ifdef CONFIG_TCP_MD5SIG
1748 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1749         .md5_lookup             = tcp_v4_md5_lookup,
1750         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1751         .md5_add                = tcp_v4_md5_add_func,
1752         .md5_parse              = tcp_v4_parse_md5_keys,
1753 };
1754 #endif
1755
1756 /* NOTE: A lot of things are set to zero explicitly by the call to
1757  *       sk_alloc(), so they need not be done here.
1758  */
1759 static int tcp_v4_init_sock(struct sock *sk)
1760 {
1761         struct inet_connection_sock *icsk = inet_csk(sk);
1762         struct tcp_sock *tp = tcp_sk(sk);
1763
1764         skb_queue_head_init(&tp->out_of_order_queue);
1765         tcp_init_xmit_timers(sk);
1766         tcp_prequeue_init(tp);
1767
1768         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1769         tp->mdev = TCP_TIMEOUT_INIT;
1770
1771         /* So many TCP implementations out there (incorrectly) count the
1772          * initial SYN frame in their delayed-ACK and congestion control
1773          * algorithms that we must have the following bandaid to talk
1774          * efficiently to them.  -DaveM
1775          */
1776         tp->snd_cwnd = 2;
1777
1778         /* See draft-stevens-tcpca-spec-01 for discussion of the
1779          * initialization of these values.
1780          */
1781         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1782         tp->snd_cwnd_clamp = ~0;
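             /* 536 is the RFC 1122 default MSS, used until a better value
              * is learned from the route or the peer.
              */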
1783         tp->mss_cache = 536;
1784
1785         tp->reordering = sysctl_tcp_reordering;
1786         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1787
1788         sk->sk_state = TCP_CLOSE;
1789
1790         sk->sk_write_space = sk_stream_write_space;
1791         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1792
1793         icsk->icsk_af_ops = &ipv4_specific;
1794         icsk->icsk_sync_mss = tcp_sync_mss;
1795 #ifdef CONFIG_TCP_MD5SIG
1796         tp->af_specific = &tcp_sock_ipv4_specific;
1797 #endif
1798
1799         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1800         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1801
1802         atomic_inc(&tcp_sockets_allocated);
1803
1804         return 0;
1805 }
1806
1807 void tcp_v4_destroy_sock(struct sock *sk)
1808 {
1809         struct tcp_sock *tp = tcp_sk(sk);
1810
1811         tcp_clear_xmit_timers(sk);
1812
1813         tcp_cleanup_congestion_control(sk);
1814
1815         /* Clean up the write buffer. */
1816         tcp_write_queue_purge(sk);
1817
1818         /* Cleans up our, hopefully empty, out_of_order_queue. */
1819         __skb_queue_purge(&tp->out_of_order_queue);
1820
1821 #ifdef CONFIG_TCP_MD5SIG
1822         /* Clean up the MD5 key list, if any */
1823         if (tp->md5sig_info) {
1824                 tcp_v4_clear_md5_list(sk);
1825                 kfree(tp->md5sig_info);
1826                 tp->md5sig_info = NULL;
1827         }
1828 #endif
1829
1830 #ifdef CONFIG_NET_DMA
1831         /* Cleans up our sk_async_wait_queue */
1832         __skb_queue_purge(&sk->sk_async_wait_queue);
1833 #endif
1834
1835         /* Clean up the prequeue; it should already be empty. */
1836         __skb_queue_purge(&tp->ucopy.prequeue);
1837
1838         /* Clean up a referenced TCP bind bucket. */
1839         if (inet_csk(sk)->icsk_bind_hash)
1840                 inet_put_port(sk);
1841
1842         /*
1843          * If sendmsg cached page exists, toss it.
1844          */
1845         if (sk->sk_sndmsg_page) {
1846                 __free_page(sk->sk_sndmsg_page);
1847                 sk->sk_sndmsg_page = NULL;
1848         }
1849
1850         atomic_dec(&tcp_sockets_allocated);
1851 }
1852
1853 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1854
1855 #ifdef CONFIG_PROC_FS
1856 /* Proc filesystem TCP sock list dumping. */
1857
1858 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1859 {
1860         return hlist_empty(head) ? NULL :
1861                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1862 }
1863
1864 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1865 {
1866         return tw->tw_node.next ?
1867                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1868 }
1869
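     /* Walk the listening hash.  While positioned on a listener, the open
      * requests in its SYN table are reported as well before moving on to
      * the next socket or bucket.
      */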
1870 static void *listening_get_next(struct seq_file *seq, void *cur)
1871 {
1872         struct inet_connection_sock *icsk;
1873         struct hlist_node *node;
1874         struct sock *sk = cur;
1875         struct tcp_iter_state *st = seq->private;
1876         struct net *net = seq_file_net(seq);
1877
1878         if (!sk) {
1879                 st->bucket = 0;
1880                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1881                 goto get_sk;
1882         }
1883
1884         ++st->num;
1885
1886         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1887                 struct request_sock *req = cur;
1888
1889                 icsk = inet_csk(st->syn_wait_sk);
1890                 req = req->dl_next;
1891                 while (1) {
1892                         while (req) {
1893                                 if (req->rsk_ops->family == st->family) {
1894                                         cur = req;
1895                                         goto out;
1896                                 }
1897                                 req = req->dl_next;
1898                         }
1899                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1900                                 break;
1901 get_req:
1902                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1903                 }
1904                 sk        = sk_next(st->syn_wait_sk);
1905                 st->state = TCP_SEQ_STATE_LISTENING;
1906                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1907         } else {
1908                 icsk = inet_csk(sk);
1909                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1910                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1911                         goto start_req;
1912                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1913                 sk = sk_next(sk);
1914         }
1915 get_sk:
1916         sk_for_each_from(sk, node) {
1917                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1918                         cur = sk;
1919                         goto out;
1920                 }
1921                 icsk = inet_csk(sk);
1922                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1923                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1924 start_req:
1925                         st->uid         = sock_i_uid(sk);
1926                         st->syn_wait_sk = sk;
1927                         st->state       = TCP_SEQ_STATE_OPENREQ;
1928                         st->sbucket     = 0;
1929                         goto get_req;
1930                 }
1931                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1932         }
1933         if (++st->bucket < INET_LHTABLE_SIZE) {
1934                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1935                 goto get_sk;
1936         }
1937         cur = NULL;
1938 out:
1939         return cur;
1940 }
1941
1942 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1943 {
1944         void *rc = listening_get_next(seq, NULL);
1945
1946         while (rc && *pos) {
1947                 rc = listening_get_next(seq, rc);
1948                 --*pos;
1949         }
1950         return rc;
1951 }
1952
1953 static inline int empty_bucket(struct tcp_iter_state *st)
1954 {
1955         return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1956                 hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1957 }
1958
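     /* Each established-hash bucket holds a chain of full sockets and a
      * chain of TIME_WAIT sockets; report the former first, then the latter.
      */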
1959 static void *established_get_first(struct seq_file *seq)
1960 {
1961         struct tcp_iter_state *st = seq->private;
1962         struct net *net = seq_file_net(seq);
1963         void *rc = NULL;
1964
1965         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1966                 struct sock *sk;
1967                 struct hlist_node *node;
1968                 struct inet_timewait_sock *tw;
1969                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1970
1971                 /* Lockless fast path for the common case of empty buckets */
1972                 if (empty_bucket(st))
1973                         continue;
1974
1975                 read_lock_bh(lock);
1976                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1977                         if (sk->sk_family != st->family ||
1978                             !net_eq(sock_net(sk), net)) {
1979                                 continue;
1980                         }
1981                         rc = sk;
1982                         goto out;
1983                 }
1984                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1985                 inet_twsk_for_each(tw, node,
1986                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
1987                         if (tw->tw_family != st->family ||
1988                             !net_eq(twsk_net(tw), net)) {
1989                                 continue;
1990                         }
1991                         rc = tw;
1992                         goto out;
1993                 }
1994                 read_unlock_bh(lock);
1995                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1996         }
1997 out:
1998         return rc;
1999 }
2000
2001 static void *established_get_next(struct seq_file *seq, void *cur)
2002 {
2003         struct sock *sk = cur;
2004         struct inet_timewait_sock *tw;
2005         struct hlist_node *node;
2006         struct tcp_iter_state *st = seq->private;
2007         struct net *net = seq_file_net(seq);
2008
2009         ++st->num;
2010
2011         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2012                 tw = cur;
2013                 tw = tw_next(tw);
2014 get_tw:
2015                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2016                         tw = tw_next(tw);
2017                 }
2018                 if (tw) {
2019                         cur = tw;
2020                         goto out;
2021                 }
2022                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2023                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2024
2025                 /* Look for the next non-empty bucket */
2026                 while (++st->bucket < tcp_hashinfo.ehash_size &&
2027                                 empty_bucket(st))
2028                         ;
2029                 if (st->bucket >= tcp_hashinfo.ehash_size)
2030                         return NULL;
2031
2032                 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2033                 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2034         } else
2035                 sk = sk_next(sk);
2036
2037         sk_for_each_from(sk, node) {
2038                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2039                         goto found;
2040         }
2041
2042         st->state = TCP_SEQ_STATE_TIME_WAIT;
2043         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2044         goto get_tw;
2045 found:
2046         cur = sk;
2047 out:
2048         return cur;
2049 }
2050
2051 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2052 {
2053         void *rc = established_get_first(seq);
2054
2055         while (rc && pos) {
2056                 rc = established_get_next(seq, rc);
2057                 --pos;
2058         }
2059         return rc;
2060 }
2061
2062 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2063 {
2064         void *rc;
2065         struct tcp_iter_state *st = seq->private;
2066
2067         inet_listen_lock(&tcp_hashinfo);
2068         st->state = TCP_SEQ_STATE_LISTENING;
2069         rc        = listening_get_idx(seq, &pos);
2070
2071         if (!rc) {
2072                 inet_listen_unlock(&tcp_hashinfo);
2073                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2074                 rc        = established_get_idx(seq, pos);
2075         }
2076
2077         return rc;
2078 }
2079
2080 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2081 {
2082         struct tcp_iter_state *st = seq->private;
2083         st->state = TCP_SEQ_STATE_LISTENING;
2084         st->num = 0;
2085         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2086 }
2087
2088 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2089 {
2090         void *rc = NULL;
2091         struct tcp_iter_state *st;
2092
2093         if (v == SEQ_START_TOKEN) {
2094                 rc = tcp_get_idx(seq, 0);
2095                 goto out;
2096         }
2097         st = seq->private;
2098
2099         switch (st->state) {
2100         case TCP_SEQ_STATE_OPENREQ:
2101         case TCP_SEQ_STATE_LISTENING:
2102                 rc = listening_get_next(seq, v);
2103                 if (!rc) {
2104                         inet_listen_unlock(&tcp_hashinfo);
2105                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2106                         rc        = established_get_first(seq);
2107                 }
2108                 break;
2109         case TCP_SEQ_STATE_ESTABLISHED:
2110         case TCP_SEQ_STATE_TIME_WAIT:
2111                 rc = established_get_next(seq, v);
2112                 break;
2113         }
2114 out:
2115         ++*pos;
2116         return rc;
2117 }
2118
2119 static void tcp_seq_stop(struct seq_file *seq, void *v)
2120 {
2121         struct tcp_iter_state *st = seq->private;
2122
2123         switch (st->state) {
2124         case TCP_SEQ_STATE_OPENREQ:
2125                 if (v) {
2126                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2127                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2128                 }
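                     /* Fall through: the listening lock is held in this
                      * state as well and must be dropped too.
                      */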
2129         case TCP_SEQ_STATE_LISTENING:
2130                 if (v != SEQ_START_TOKEN)
2131                         inet_listen_unlock(&tcp_hashinfo);
2132                 break;
2133         case TCP_SEQ_STATE_TIME_WAIT:
2134         case TCP_SEQ_STATE_ESTABLISHED:
2135                 if (v)
2136                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2137                 break;
2138         }
2139 }
2140
2141 static int tcp_seq_open(struct inode *inode, struct file *file)
2142 {
2143         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2144         struct tcp_iter_state *s;
2145         int err;
2146
2147         err = seq_open_net(inode, file, &afinfo->seq_ops,
2148                           sizeof(struct tcp_iter_state));
2149         if (err < 0)
2150                 return err;
2151
2152         s = ((struct seq_file *)file->private_data)->private;
2153         s->family               = afinfo->family;
2154         return 0;
2155 }
2156
2157 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2158 {
2159         int rc = 0;
2160         struct proc_dir_entry *p;
2161
2162         afinfo->seq_fops.open           = tcp_seq_open;
2163         afinfo->seq_fops.read           = seq_read;
2164         afinfo->seq_fops.llseek         = seq_lseek;
2165         afinfo->seq_fops.release        = seq_release_net;
2166
2167         afinfo->seq_ops.start           = tcp_seq_start;
2168         afinfo->seq_ops.next            = tcp_seq_next;
2169         afinfo->seq_ops.stop            = tcp_seq_stop;
2170
2171         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2172                              &afinfo->seq_fops, afinfo);
2173         if (!p)
2174                 rc = -ENOMEM;
2175         return rc;
2176 }
2177
2178 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2179 {
2180         proc_net_remove(net, afinfo->name);
2181 }
2182
2183 static void get_openreq4(struct sock *sk, struct request_sock *req,
2184                          struct seq_file *f, int i, int uid, int *len)
2185 {
2186         const struct inet_request_sock *ireq = inet_rsk(req);
2187         int ttd = req->expires - jiffies;
2188
2189         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2190                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2191                 i,
2192                 ireq->loc_addr,
2193                 ntohs(inet_sk(sk)->sport),
2194                 ireq->rmt_addr,
2195                 ntohs(ireq->rmt_port),
2196                 TCP_SYN_RECV,
2197                 0, 0, /* could print option size, but that is af dependent. */
2198                 1,    /* timers active (only the expire timer) */
2199                 jiffies_to_clock_t(ttd),
2200                 req->retrans,
2201                 uid,
2202                 0,  /* non standard timer */
2203                 0, /* open_requests have no inode */
2204                 atomic_read(&sk->sk_refcnt),
2205                 req,
2206                 len);
2207 }
2208
2209 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2210 {
2211         int timer_active;
2212         unsigned long timer_expires;
2213         struct tcp_sock *tp = tcp_sk(sk);
2214         const struct inet_connection_sock *icsk = inet_csk(sk);
2215         struct inet_sock *inet = inet_sk(sk);
2216         __be32 dest = inet->daddr;
2217         __be32 src = inet->rcv_saddr;
2218         __u16 destp = ntohs(inet->dport);
2219         __u16 srcp = ntohs(inet->sport);
2220
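             /* timer_active encodes the 'tr' column of /proc/net/tcp:
              * 1 retransmit timer, 2 another timer (sk_timer, e.g.
              * keepalive), 4 zero-window probe; 3 is used for TIME_WAIT
              * sockets in get_timewait4_sock().
              */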
2221         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2222                 timer_active    = 1;
2223                 timer_expires   = icsk->icsk_timeout;
2224         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2225                 timer_active    = 4;
2226                 timer_expires   = icsk->icsk_timeout;
2227         } else if (timer_pending(&sk->sk_timer)) {
2228                 timer_active    = 2;
2229                 timer_expires   = sk->sk_timer.expires;
2230         } else {
2231                 timer_active    = 0;
2232                 timer_expires   = jiffies;
2233         }
2234
2235         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2236                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2237                 i, src, srcp, dest, destp, sk->sk_state,
2238                 tp->write_seq - tp->snd_una,
2239                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2240                                              (tp->rcv_nxt - tp->copied_seq),
2241                 timer_active,
2242                 jiffies_to_clock_t(timer_expires - jiffies),
2243                 icsk->icsk_retransmits,
2244                 sock_i_uid(sk),
2245                 icsk->icsk_probes_out,
2246                 sock_i_ino(sk),
2247                 atomic_read(&sk->sk_refcnt), sk,
2248                 jiffies_to_clock_t(icsk->icsk_rto),
2249                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2250                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2251                 tp->snd_cwnd,
2252                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2253                 len);
2254 }
2255
2256 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2257                                struct seq_file *f, int i, int *len)
2258 {
2259         __be32 dest, src;
2260         __u16 destp, srcp;
2261         int ttd = tw->tw_ttd - jiffies;
2262
2263         if (ttd < 0)
2264                 ttd = 0;
2265
2266         dest  = tw->tw_daddr;
2267         src   = tw->tw_rcv_saddr;
2268         destp = ntohs(tw->tw_dport);
2269         srcp  = ntohs(tw->tw_sport);
2270
2271         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2272                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2273                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2274                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2275                 atomic_read(&tw->tw_refcnt), tw, len);
2276 }
2277
2278 #define TMPSZ 150
2279
2280 static int tcp4_seq_show(struct seq_file *seq, void *v)
2281 {
2282         struct tcp_iter_state *st;
2283         int len;
2284
2285         if (v == SEQ_START_TOKEN) {
2286                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2287                            "  sl  local_address rem_address   st tx_queue "
2288                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2289                            "inode");
2290                 goto out;
2291         }
2292         st = seq->private;
2293
2294         switch (st->state) {
2295         case TCP_SEQ_STATE_LISTENING:
2296         case TCP_SEQ_STATE_ESTABLISHED:
2297                 get_tcp4_sock(v, seq, st->num, &len);
2298                 break;
2299         case TCP_SEQ_STATE_OPENREQ:
2300                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2301                 break;
2302         case TCP_SEQ_STATE_TIME_WAIT:
2303                 get_timewait4_sock(v, seq, st->num, &len);
2304                 break;
2305         }
2306         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2307 out:
2308         return 0;
2309 }
2310
2311 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2312         .name           = "tcp",
2313         .family         = AF_INET,
2314         .seq_fops       = {
2315                 .owner          = THIS_MODULE,
2316         },
2317         .seq_ops        = {
2318                 .show           = tcp4_seq_show,
2319         },
2320 };
2321
2322 static int tcp4_proc_init_net(struct net *net)
2323 {
2324         return tcp_proc_register(net, &tcp4_seq_afinfo);
2325 }
2326
2327 static void tcp4_proc_exit_net(struct net *net)
2328 {
2329         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2330 }
2331
2332 static struct pernet_operations tcp4_net_ops = {
2333         .init = tcp4_proc_init_net,
2334         .exit = tcp4_proc_exit_net,
2335 };
2336
2337 int __init tcp4_proc_init(void)
2338 {
2339         return register_pernet_subsys(&tcp4_net_ops);
2340 }
2341
2342 void tcp4_proc_exit(void)
2343 {
2344         unregister_pernet_subsys(&tcp4_net_ops);
2345 }
2346 #endif /* CONFIG_PROC_FS */
2347
2348 struct proto tcp_prot = {
2349         .name                   = "TCP",
2350         .owner                  = THIS_MODULE,
2351         .close                  = tcp_close,
2352         .connect                = tcp_v4_connect,
2353         .disconnect             = tcp_disconnect,
2354         .accept                 = inet_csk_accept,
2355         .ioctl                  = tcp_ioctl,
2356         .init                   = tcp_v4_init_sock,
2357         .destroy                = tcp_v4_destroy_sock,
2358         .shutdown               = tcp_shutdown,
2359         .setsockopt             = tcp_setsockopt,
2360         .getsockopt             = tcp_getsockopt,
2361         .recvmsg                = tcp_recvmsg,
2362         .backlog_rcv            = tcp_v4_do_rcv,
2363         .hash                   = inet_hash,
2364         .unhash                 = inet_unhash,
2365         .get_port               = inet_csk_get_port,
2366         .enter_memory_pressure  = tcp_enter_memory_pressure,
2367         .sockets_allocated      = &tcp_sockets_allocated,
2368         .orphan_count           = &tcp_orphan_count,
2369         .memory_allocated       = &tcp_memory_allocated,
2370         .memory_pressure        = &tcp_memory_pressure,
2371         .sysctl_mem             = sysctl_tcp_mem,
2372         .sysctl_wmem            = sysctl_tcp_wmem,
2373         .sysctl_rmem            = sysctl_tcp_rmem,
2374         .max_header             = MAX_TCP_HEADER,
2375         .obj_size               = sizeof(struct tcp_sock),
2376         .twsk_prot              = &tcp_timewait_sock_ops,
2377         .rsk_prot               = &tcp_request_sock_ops,
2378         .h.hashinfo             = &tcp_hashinfo,
2379 #ifdef CONFIG_COMPAT
2380         .compat_setsockopt      = compat_tcp_setsockopt,
2381         .compat_getsockopt      = compat_tcp_getsockopt,
2382 #endif
2383 };
2384
2385
2386 static int __net_init tcp_sk_init(struct net *net)
2387 {
2388         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2389                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2390 }
2391
2392 static void __net_exit tcp_sk_exit(struct net *net)
2393 {
2394         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2395         inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2396 }
2397
2398 static struct pernet_operations __net_initdata tcp_sk_ops = {
2399        .init = tcp_sk_init,
2400        .exit = tcp_sk_exit,
2401 };
2402
2403 void __init tcp_v4_init(void)
2404 {
2405         if (register_pernet_device(&tcp_sk_ops))
2406                 panic("Failed to create the TCP control socket.\n");
2407 }
2408
2409 EXPORT_SYMBOL(ipv4_specific);
2410 EXPORT_SYMBOL(tcp_hashinfo);
2411 EXPORT_SYMBOL(tcp_prot);
2412 EXPORT_SYMBOL(tcp_v4_conn_request);
2413 EXPORT_SYMBOL(tcp_v4_connect);
2414 EXPORT_SYMBOL(tcp_v4_do_rcv);
2415 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2416 EXPORT_SYMBOL(tcp_v4_send_check);
2417 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2418
2419 #ifdef CONFIG_PROC_FS
2420 EXPORT_SYMBOL(tcp_proc_register);
2421 EXPORT_SYMBOL(tcp_proc_unregister);
2422 #endif
2423 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2424