net: Use a percpu_counter for sockets_allocated
net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/types.h>
55 #include <linux/fcntl.h>
56 #include <linux/module.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 #include <linux/times.h>
62
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/netdma.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 #include <linux/crypto.h>
81 #include <linux/scatterlist.h>
82
83 int sysctl_tcp_tw_reuse __read_mostly;
84 int sysctl_tcp_low_latency __read_mostly;
85
86
87 #ifdef CONFIG_TCP_MD5SIG
88 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
89                                                    __be32 addr);
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
91                                __be32 daddr, __be32 saddr, struct tcphdr *th);
92 #else
93 static inline
94 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
95 {
96         return NULL;
97 }
98 #endif
99
100 struct inet_hashinfo tcp_hashinfo;
101
102 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
103 {
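        /*
         * Note: secure_tcp_sequence_number() derives the ISN from the
         * connection 4-tuple hashed with a secret plus a clock component,
         * so it is hard for an off-path attacker to guess yet still
         * advances over time for repeat connections between the same
         * endpoints.
         */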
104         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
105                                           ip_hdr(skb)->saddr,
106                                           tcp_hdr(skb)->dest,
107                                           tcp_hdr(skb)->source);
108 }
109
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113         struct tcp_sock *tp = tcp_sk(sk);
114
115         /* With PAWS, this is safe from the viewpoint
116            of data integrity. Even without PAWS it is safe, provided the sequence
117            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
118
119            Actually, the idea is close to VJ's: only the timestamp cache is
120            held not per host but per port pair, and the TW bucket is used as the
121            state holder.
122
123            If the TW bucket has already been destroyed we fall back to VJ's scheme
124            and use the initial timestamp retrieved from the peer table.
125          */
126         if (tcptw->tw_ts_recent_stamp &&
127             (twp == NULL || (sysctl_tcp_tw_reuse &&
128                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
129                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
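                /*
                 * Starting the new connection 64K beyond the old one's
                 * snd_nxt keeps any stray segments from the previous
                 * incarnation outside the new window, so they cannot be
                 * mistaken for new data.
                 */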
130                 if (tp->write_seq == 0)
131                         tp->write_seq = 1;
132                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
133                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
134                 sock_hold(sktw);
135                 return 1;
136         }
137
138         return 0;
139 }
140
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
142
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146         struct inet_sock *inet = inet_sk(sk);
147         struct tcp_sock *tp = tcp_sk(sk);
148         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
149         struct rtable *rt;
150         __be32 daddr, nexthop;
151         int tmp;
152         int err;
153
154         if (addr_len < sizeof(struct sockaddr_in))
155                 return -EINVAL;
156
157         if (usin->sin_family != AF_INET)
158                 return -EAFNOSUPPORT;
159
160         nexthop = daddr = usin->sin_addr.s_addr;
161         if (inet->opt && inet->opt->srr) {
162                 if (!daddr)
163                         return -EINVAL;
164                 nexthop = inet->opt->faddr;
165         }
166
167         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
168                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
169                                IPPROTO_TCP,
170                                inet->sport, usin->sin_port, sk, 1);
171         if (tmp < 0) {
172                 if (tmp == -ENETUNREACH)
173                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
174                 return tmp;
175         }
176
177         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
178                 ip_rt_put(rt);
179                 return -ENETUNREACH;
180         }
181
182         if (!inet->opt || !inet->opt->srr)
183                 daddr = rt->rt_dst;
184
185         if (!inet->saddr)
186                 inet->saddr = rt->rt_src;
187         inet->rcv_saddr = inet->saddr;
188
189         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
190                 /* Reset inherited state */
191                 tp->rx_opt.ts_recent       = 0;
192                 tp->rx_opt.ts_recent_stamp = 0;
193                 tp->write_seq              = 0;
194         }
195
196         if (tcp_death_row.sysctl_tw_recycle &&
197             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
198                 struct inet_peer *peer = rt_get_peer(rt);
199                 /*
200                  * VJ's idea. We save the last timestamp seen from
201                  * the destination in the peer table when entering state
202                  * TIME-WAIT, and initialize rx_opt.ts_recent from it
203                  * when trying a new connection.
204                  */
205                 if (peer != NULL &&
206                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
207                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
208                         tp->rx_opt.ts_recent = peer->tcp_ts;
209                 }
210         }
211
212         inet->dport = usin->sin_port;
213         inet->daddr = daddr;
214
215         inet_csk(sk)->icsk_ext_hdr_len = 0;
216         if (inet->opt)
217                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
218
219         tp->rx_opt.mss_clamp = 536;
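        /*
         * 536 is the default IPv4 MSS: the 576 byte minimum reassembly
         * buffer required by RFC 1122 minus 40 bytes of IP and TCP headers.
         * It only acts as a clamp until the peer advertises its own MSS.
         */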
220
221         /* Socket identity is still unknown (sport may be zero).
222          * However we set the state to SYN-SENT and, without releasing the
223          * socket lock, select a source port, enter ourselves into the hash
224          * tables and complete initialization after this.
225          */
226         tcp_set_state(sk, TCP_SYN_SENT);
227         err = inet_hash_connect(&tcp_death_row, sk);
228         if (err)
229                 goto failure;
230
231         err = ip_route_newports(&rt, IPPROTO_TCP,
232                                 inet->sport, inet->dport, sk);
233         if (err)
234                 goto failure;
235
236         /* OK, now commit destination to socket.  */
237         sk->sk_gso_type = SKB_GSO_TCPV4;
238         sk_setup_caps(sk, &rt->u.dst);
239
240         if (!tp->write_seq)
241                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
242                                                            inet->daddr,
243                                                            inet->sport,
244                                                            usin->sin_port);
245
246         inet->id = tp->write_seq ^ jiffies;
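        /*
         * Seed the IP identification counter from the initial sequence
         * number mixed with jiffies, presumably so that consecutive sockets
         * do not start their IP ID sequences at predictable values.
         */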
247
248         err = tcp_connect(sk);
249         rt = NULL;
250         if (err)
251                 goto failure;
252
253         return 0;
254
255 failure:
256         /*
257          * This unhashes the socket and releases the local port,
258          * if necessary.
259          */
260         tcp_set_state(sk, TCP_CLOSE);
261         ip_rt_put(rt);
262         sk->sk_route_caps = 0;
263         inet->dport = 0;
264         return err;
265 }
266
267 /*
268  * This routine does path mtu discovery as defined in RFC1191.
269  */
270 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
271 {
272         struct dst_entry *dst;
273         struct inet_sock *inet = inet_sk(sk);
274
275         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
276          * sent out by Linux are always < 576 bytes, so they should go through
277          * unfragmented).
278          */
279         if (sk->sk_state == TCP_LISTEN)
280                 return;
281
282         /* We don't check in the dst entry whether pmtu discovery is forbidden
283          * on this route. We just assume that no packet-too-big packets
284          * are sent back when pmtu discovery is not active.
285          * There is a small race when the user changes this flag in the
286          * route, but I think that's acceptable.
287          */
288         if ((dst = __sk_dst_check(sk, 0)) == NULL)
289                 return;
290
291         dst->ops->update_pmtu(dst, mtu);
292
293         /* Something is about to go wrong... Remember the soft error
294          * in case this connection will not be able to recover.
295          */
296         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
297                 sk->sk_err_soft = EMSGSIZE;
298
299         mtu = dst_mtu(dst);
300
301         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
302             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
303                 tcp_sync_mss(sk, mtu);
304
305                 /* Resend the TCP packet because it's
306                  * clear that the old packet has been
307                  * dropped. This is the new "fast" path mtu
308                  * discovery.
309                  */
310                 tcp_simple_retransmit(sk);
311         } /* else let the usual retransmit timer handle it */
312 }
313
314 /*
315  * This routine is called by the ICMP module when it gets some
316  * sort of error condition.  If err < 0 then the socket should
317  * be closed and the error returned to the user.  If err > 0
318  * it's just the icmp type << 8 | icmp code.  After adjustment
319  * header points to the first 8 bytes of the tcp header.  We need
320  * to find the appropriate port.
321  *
322  * The locking strategy used here is very "optimistic". When
323  * someone else accesses the socket the ICMP is just dropped
324  * and for some paths there is no check at all.
325  * A more general error queue to queue errors for later handling
326  * is probably better.
327  *
328  */
329
330 void tcp_v4_err(struct sk_buff *skb, u32 info)
331 {
332         struct iphdr *iph = (struct iphdr *)skb->data;
333         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
334         struct tcp_sock *tp;
335         struct inet_sock *inet;
336         const int type = icmp_hdr(skb)->type;
337         const int code = icmp_hdr(skb)->code;
338         struct sock *sk;
339         __u32 seq;
340         int err;
341         struct net *net = dev_net(skb->dev);
342
343         if (skb->len < (iph->ihl << 2) + 8) {
344                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
345                 return;
346         }
347
348         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
349                         iph->saddr, th->source, inet_iif(skb));
350         if (!sk) {
351                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
352                 return;
353         }
354         if (sk->sk_state == TCP_TIME_WAIT) {
355                 inet_twsk_put(inet_twsk(sk));
356                 return;
357         }
358
359         bh_lock_sock(sk);
360         /* If too many ICMPs get dropped on busy
361          * servers this needs to be solved differently.
362          */
363         if (sock_owned_by_user(sk))
364                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
365
366         if (sk->sk_state == TCP_CLOSE)
367                 goto out;
368
369         tp = tcp_sk(sk);
370         seq = ntohl(th->seq);
371         if (sk->sk_state != TCP_LISTEN &&
372             !between(seq, tp->snd_una, tp->snd_nxt)) {
373                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
374                 goto out;
375         }
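        /*
         * The check above drops ICMP errors quoting a sequence number
         * outside [snd_una, snd_nxt]: such a quote cannot refer to anything
         * we currently have in flight, so it is treated as stale or forged
         * and only the out-of-window counter is bumped.
         */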
376
377         switch (type) {
378         case ICMP_SOURCE_QUENCH:
379                 /* Just silently ignore these. */
380                 goto out;
381         case ICMP_PARAMETERPROB:
382                 err = EPROTO;
383                 break;
384         case ICMP_DEST_UNREACH:
385                 if (code > NR_ICMP_UNREACH)
386                         goto out;
387
388                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
389                         if (!sock_owned_by_user(sk))
390                                 do_pmtu_discovery(sk, iph, info);
391                         goto out;
392                 }
393
394                 err = icmp_err_convert[code].errno;
395                 break;
396         case ICMP_TIME_EXCEEDED:
397                 err = EHOSTUNREACH;
398                 break;
399         default:
400                 goto out;
401         }
402
403         switch (sk->sk_state) {
404                 struct request_sock *req, **prev;
405         case TCP_LISTEN:
406                 if (sock_owned_by_user(sk))
407                         goto out;
408
409                 req = inet_csk_search_req(sk, &prev, th->dest,
410                                           iph->daddr, iph->saddr);
411                 if (!req)
412                         goto out;
413
414                 /* ICMPs are not backlogged, hence we cannot get
415                    an established socket here.
416                  */
417                 WARN_ON(req->sk);
418
419                 if (seq != tcp_rsk(req)->snt_isn) {
420                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
421                         goto out;
422                 }
423
424                 /*
425                  * Still in SYN_RECV, just remove it silently.
426                  * There is no good way to pass the error to the newly
427                  * created socket, and POSIX does not want network
428                  * errors returned from accept().
429                  */
430                 inet_csk_reqsk_queue_drop(sk, req, prev);
431                 goto out;
432
433         case TCP_SYN_SENT:
434         case TCP_SYN_RECV:  /* Cannot happen?
435                                It can, for example, if SYNs crossed.
436                              */
437                 if (!sock_owned_by_user(sk)) {
438                         sk->sk_err = err;
439
440                         sk->sk_error_report(sk);
441
442                         tcp_done(sk);
443                 } else {
444                         sk->sk_err_soft = err;
445                 }
446                 goto out;
447         }
448
449         /* If we've already connected we will keep trying
450          * until we time out, or the user gives up.
451          *
452          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
453          * considered hard errors (well, FRAG_FAILED too,
454          * but it is obsoleted by pmtu discovery).
455          *
456          * Note that in the modern internet, where routing is unreliable
457          * and broken firewalls sit in every dark corner sending random
458          * errors ordered by their masters, even these two messages finally
459          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
460          *
461          * Now we are in compliance with the RFCs.
462          *                                                      --ANK (980905)
463          */
464
465         inet = inet_sk(sk);
466         if (!sock_owned_by_user(sk) && inet->recverr) {
467                 sk->sk_err = err;
468                 sk->sk_error_report(sk);
469         } else  { /* Only an error on timeout */
470                 sk->sk_err_soft = err;
471         }
472
473 out:
474         bh_unlock_sock(sk);
475         sock_put(sk);
476 }
477
478 /* This routine computes an IPv4 TCP checksum. */
479 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
480 {
481         struct inet_sock *inet = inet_sk(sk);
482         struct tcphdr *th = tcp_hdr(skb);
483
484         if (skb->ip_summed == CHECKSUM_PARTIAL) {
485                 th->check = ~tcp_v4_check(len, inet->saddr,
486                                           inet->daddr, 0);
487                 skb->csum_start = skb_transport_header(skb) - skb->head;
488                 skb->csum_offset = offsetof(struct tcphdr, check);
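                /*
                 * With CHECKSUM_PARTIAL only the pseudo-header sum is filled
                 * in here; the device (or skb_checksum_help() as a fallback)
                 * folds the payload into th->check at csum_start/csum_offset
                 * just before transmission.
                 */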
489         } else {
490                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
491                                          csum_partial(th,
492                                                       th->doff << 2,
493                                                       skb->csum));
494         }
495 }
496
497 int tcp_v4_gso_send_check(struct sk_buff *skb)
498 {
499         const struct iphdr *iph;
500         struct tcphdr *th;
501
502         if (!pskb_may_pull(skb, sizeof(*th)))
503                 return -EINVAL;
504
505         iph = ip_hdr(skb);
506         th = tcp_hdr(skb);
507
508         th->check = 0;
509         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
510         skb->csum_start = skb_transport_header(skb) - skb->head;
511         skb->csum_offset = offsetof(struct tcphdr, check);
512         skb->ip_summed = CHECKSUM_PARTIAL;
513         return 0;
514 }
515
516 /*
517  *      This routine will send an RST to the other tcp.
518  *
519  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
520  *                    for the reset?
521  *      Answer: if a packet caused an RST, it was not meant for a socket
522  *              existing in our system; if it does match a socket,
523  *              it is just a duplicate segment or a bug in the other side's TCP.
524  *              So we build the reply based only on the parameters that
525  *              arrived with the segment.
526  *      Exception: precedence violation. We do not implement it in any case.
527  */
528
529 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
530 {
531         struct tcphdr *th = tcp_hdr(skb);
532         struct {
533                 struct tcphdr th;
534 #ifdef CONFIG_TCP_MD5SIG
535                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
536 #endif
537         } rep;
538         struct ip_reply_arg arg;
539 #ifdef CONFIG_TCP_MD5SIG
540         struct tcp_md5sig_key *key;
541 #endif
542         struct net *net;
543
544         /* Never send a reset in response to a reset. */
545         if (th->rst)
546                 return;
547
548         if (skb->rtable->rt_type != RTN_LOCAL)
549                 return;
550
551         /* Swap the send and the receive. */
552         memset(&rep, 0, sizeof(rep));
553         rep.th.dest   = th->source;
554         rep.th.source = th->dest;
555         rep.th.doff   = sizeof(struct tcphdr) / 4;
556         rep.th.rst    = 1;
557
558         if (th->ack) {
559                 rep.th.seq = th->ack_seq;
560         } else {
561                 rep.th.ack = 1;
562                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
563                                        skb->len - (th->doff << 2));
564         }
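        /*
         * This follows the RFC 793 reset generation rules: if the offending
         * segment carried an ACK, the RST takes its sequence number from
         * that ACK; otherwise the RST itself ACKs everything the segment
         * occupied in sequence space (SYN and FIN each count as one).
         */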
565
566         memset(&arg, 0, sizeof(arg));
567         arg.iov[0].iov_base = (unsigned char *)&rep;
568         arg.iov[0].iov_len  = sizeof(rep.th);
569
570 #ifdef CONFIG_TCP_MD5SIG
571         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
572         if (key) {
573                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
574                                    (TCPOPT_NOP << 16) |
575                                    (TCPOPT_MD5SIG << 8) |
576                                    TCPOLEN_MD5SIG);
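                /*
                 * Option layout: two NOPs pad the 18-byte MD5 signature
                 * option (kind 19, length 18) out to the 20-byte aligned
                 * block reserved in rep.opt; the 16-byte digest itself is
                 * written into rep.opt[1..4] below.
                 */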
577                 /* Bump the option length and the header length (doff) to match */
578                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
579                 rep.th.doff = arg.iov[0].iov_len / 4;
580
581                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
582                                      key, ip_hdr(skb)->saddr,
583                                      ip_hdr(skb)->daddr, &rep.th);
584         }
585 #endif
586         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
587                                       ip_hdr(skb)->saddr, /* XXX */
588                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
589         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
590         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
591
592         net = dev_net(skb->dst->dev);
593         ip_send_reply(net->ipv4.tcp_sock, skb,
594                       &arg, arg.iov[0].iov_len);
595
596         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
597         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
598 }
599
600 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
601    outside socket context, is certainly ugly. What can I do?
602  */
603
604 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
605                             u32 win, u32 ts, int oif,
606                             struct tcp_md5sig_key *key,
607                             int reply_flags)
608 {
609         struct tcphdr *th = tcp_hdr(skb);
610         struct {
611                 struct tcphdr th;
612                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
613 #ifdef CONFIG_TCP_MD5SIG
614                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
615 #endif
616                         ];
617         } rep;
618         struct ip_reply_arg arg;
619         struct net *net = dev_net(skb->dst->dev);
620
621         memset(&rep.th, 0, sizeof(struct tcphdr));
622         memset(&arg, 0, sizeof(arg));
623
624         arg.iov[0].iov_base = (unsigned char *)&rep;
625         arg.iov[0].iov_len  = sizeof(rep.th);
626         if (ts) {
627                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
628                                    (TCPOPT_TIMESTAMP << 8) |
629                                    TCPOLEN_TIMESTAMP);
630                 rep.opt[1] = htonl(tcp_time_stamp);
631                 rep.opt[2] = htonl(ts);
632                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
633         }
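        /*
         * The timestamp option (kind 8, length 10) is padded with two NOPs
         * to a 12-byte aligned block: TSval carries our current clock and
         * TSecr echoes the peer's timestamp passed in as 'ts'.
         */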
634
635         /* Swap the send and the receive. */
636         rep.th.dest    = th->source;
637         rep.th.source  = th->dest;
638         rep.th.doff    = arg.iov[0].iov_len / 4;
639         rep.th.seq     = htonl(seq);
640         rep.th.ack_seq = htonl(ack);
641         rep.th.ack     = 1;
642         rep.th.window  = htons(win);
643
644 #ifdef CONFIG_TCP_MD5SIG
645         if (key) {
646                 int offset = (ts) ? 3 : 0;
647
648                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
649                                           (TCPOPT_NOP << 16) |
650                                           (TCPOPT_MD5SIG << 8) |
651                                           TCPOLEN_MD5SIG);
652                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
653                 rep.th.doff = arg.iov[0].iov_len/4;
654
655                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
656                                     key, ip_hdr(skb)->saddr,
657                                     ip_hdr(skb)->daddr, &rep.th);
658         }
659 #endif
660         arg.flags = reply_flags;
661         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
662                                       ip_hdr(skb)->saddr, /* XXX */
663                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
664         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
665         if (oif)
666                 arg.bound_dev_if = oif;
667
668         ip_send_reply(net->ipv4.tcp_sock, skb,
669                       &arg, arg.iov[0].iov_len);
670
671         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
672 }
673
674 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
675 {
676         struct inet_timewait_sock *tw = inet_twsk(sk);
677         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
678
679         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
680                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
681                         tcptw->tw_ts_recent,
682                         tw->tw_bound_dev_if,
683                         tcp_twsk_md5_key(tcptw),
684                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
685                         );
686
687         inet_twsk_put(tw);
688 }
689
690 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
691                                   struct request_sock *req)
692 {
693         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
694                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
695                         req->ts_recent,
696                         0,
697                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
698                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
699 }
700
701 /*
702  *      Send a SYN-ACK after having received a SYN.
703  *      This still operates on a request_sock only, not on a big
704  *      socket.
705  */
706 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
707                                 struct dst_entry *dst)
708 {
709         const struct inet_request_sock *ireq = inet_rsk(req);
710         int err = -1;
711         struct sk_buff * skb;
712
713         /* First, grab a route. */
714         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
715                 return -1;
716
717         skb = tcp_make_synack(sk, dst, req);
718
719         if (skb) {
720                 struct tcphdr *th = tcp_hdr(skb);
721
722                 th->check = tcp_v4_check(skb->len,
723                                          ireq->loc_addr,
724                                          ireq->rmt_addr,
725                                          csum_partial(th, skb->len,
726                                                       skb->csum));
727
728                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
729                                             ireq->rmt_addr,
730                                             ireq->opt);
731                 err = net_xmit_eval(err);
732         }
733
734         dst_release(dst);
735         return err;
736 }
737
738 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
739 {
740         return __tcp_v4_send_synack(sk, req, NULL);
741 }
742
743 /*
744  *      IPv4 request_sock destructor.
745  */
746 static void tcp_v4_reqsk_destructor(struct request_sock *req)
747 {
748         kfree(inet_rsk(req)->opt);
749 }
750
751 #ifdef CONFIG_SYN_COOKIES
752 static void syn_flood_warning(struct sk_buff *skb)
753 {
754         static unsigned long warntime;
755
756         if (time_after(jiffies, (warntime + HZ * 60))) {
757                 warntime = jiffies;
758                 printk(KERN_INFO
759                        "possible SYN flooding on port %d. Sending cookies.\n",
760                        ntohs(tcp_hdr(skb)->dest));
761         }
762 }
763 #endif
764
765 /*
766  * Save and compile IPv4 options into the request_sock if needed.
767  */
768 static struct ip_options *tcp_v4_save_options(struct sock *sk,
769                                               struct sk_buff *skb)
770 {
771         struct ip_options *opt = &(IPCB(skb)->opt);
772         struct ip_options *dopt = NULL;
773
774         if (opt && opt->optlen) {
775                 int opt_size = optlength(opt);
776                 dopt = kmalloc(opt_size, GFP_ATOMIC);
777                 if (dopt) {
778                         if (ip_options_echo(dopt, skb)) {
779                                 kfree(dopt);
780                                 dopt = NULL;
781                         }
782                 }
783         }
784         return dopt;
785 }
786
787 #ifdef CONFIG_TCP_MD5SIG
788 /*
789  * RFC2385 MD5 checksumming requires a mapping of
790  * IP address->MD5 Key.
791  * We need to maintain these in the sk structure.
792  */
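/*
 * The keys live in a flat per-socket array (md5sig_info->keys4) that is
 * searched linearly for every signed segment.  That is cheap enough in
 * practice, since an RFC 2385 user - typically a BGP speaker - configures
 * only a handful of peers per socket.
 */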
793
794 /* Find the Key structure for an address.  */
795 static struct tcp_md5sig_key *
796                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
797 {
798         struct tcp_sock *tp = tcp_sk(sk);
799         int i;
800
801         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
802                 return NULL;
803         for (i = 0; i < tp->md5sig_info->entries4; i++) {
804                 if (tp->md5sig_info->keys4[i].addr == addr)
805                         return &tp->md5sig_info->keys4[i].base;
806         }
807         return NULL;
808 }
809
810 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
811                                          struct sock *addr_sk)
812 {
813         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
814 }
815
816 EXPORT_SYMBOL(tcp_v4_md5_lookup);
817
818 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
819                                                       struct request_sock *req)
820 {
821         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
822 }
823
824 /* This can be called on a newly created socket, from other files */
825 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
826                       u8 *newkey, u8 newkeylen)
827 {
828         /* Add Key to the list */
829         struct tcp_md5sig_key *key;
830         struct tcp_sock *tp = tcp_sk(sk);
831         struct tcp4_md5sig_key *keys;
832
833         key = tcp_v4_md5_do_lookup(sk, addr);
834         if (key) {
835                 /* Pre-existing entry - just update that one. */
836                 kfree(key->key);
837                 key->key = newkey;
838                 key->keylen = newkeylen;
839         } else {
840                 struct tcp_md5sig_info *md5sig;
841
842                 if (!tp->md5sig_info) {
843                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
844                                                   GFP_ATOMIC);
845                         if (!tp->md5sig_info) {
846                                 kfree(newkey);
847                                 return -ENOMEM;
848                         }
849                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
850                 }
851                 if (tcp_alloc_md5sig_pool() == NULL) {
852                         kfree(newkey);
853                         return -ENOMEM;
854                 }
855                 md5sig = tp->md5sig_info;
856
857                 if (md5sig->alloced4 == md5sig->entries4) {
858                         keys = kmalloc((sizeof(*keys) *
859                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
860                         if (!keys) {
861                                 kfree(newkey);
862                                 tcp_free_md5sig_pool();
863                                 return -ENOMEM;
864                         }
865
866                         if (md5sig->entries4)
867                                 memcpy(keys, md5sig->keys4,
868                                        sizeof(*keys) * md5sig->entries4);
869
870                         /* Free old key list, and reference new one */
871                         kfree(md5sig->keys4);
872                         md5sig->keys4 = keys;
873                         md5sig->alloced4++;
874                 }
875                 md5sig->entries4++;
876                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
877                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
878                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
879         }
880         return 0;
881 }
882
883 EXPORT_SYMBOL(tcp_v4_md5_do_add);
884
885 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
886                                u8 *newkey, u8 newkeylen)
887 {
888         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
889                                  newkey, newkeylen);
890 }
891
892 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
893 {
894         struct tcp_sock *tp = tcp_sk(sk);
895         int i;
896
897         for (i = 0; i < tp->md5sig_info->entries4; i++) {
898                 if (tp->md5sig_info->keys4[i].addr == addr) {
899                         /* Free the key */
900                         kfree(tp->md5sig_info->keys4[i].base.key);
901                         tp->md5sig_info->entries4--;
902
903                         if (tp->md5sig_info->entries4 == 0) {
904                                 kfree(tp->md5sig_info->keys4);
905                                 tp->md5sig_info->keys4 = NULL;
906                                 tp->md5sig_info->alloced4 = 0;
907                         } else if (tp->md5sig_info->entries4 != i) {
908                                 /* Close the hole by shifting the remaining keys down */
909                                 memmove(&tp->md5sig_info->keys4[i],
910                                         &tp->md5sig_info->keys4[i+1],
911                                         (tp->md5sig_info->entries4 - i) *
912                                          sizeof(struct tcp4_md5sig_key));
913                         }
914                         tcp_free_md5sig_pool();
915                         return 0;
916                 }
917         }
918         return -ENOENT;
919 }
920
921 EXPORT_SYMBOL(tcp_v4_md5_do_del);
922
923 static void tcp_v4_clear_md5_list(struct sock *sk)
924 {
925         struct tcp_sock *tp = tcp_sk(sk);
926
927         /* Free each key, then the set of keys,
928          * the crypto element, and then decrement our
929          * hold on the last resort crypto.
930          */
931         if (tp->md5sig_info->entries4) {
932                 int i;
933                 for (i = 0; i < tp->md5sig_info->entries4; i++)
934                         kfree(tp->md5sig_info->keys4[i].base.key);
935                 tp->md5sig_info->entries4 = 0;
936                 tcp_free_md5sig_pool();
937         }
938         if (tp->md5sig_info->keys4) {
939                 kfree(tp->md5sig_info->keys4);
940                 tp->md5sig_info->keys4 = NULL;
941                 tp->md5sig_info->alloced4  = 0;
942         }
943 }
944
945 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
946                                  int optlen)
947 {
948         struct tcp_md5sig cmd;
949         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
950         u8 *newkey;
951
952         if (optlen < sizeof(cmd))
953                 return -EINVAL;
954
955         if (copy_from_user(&cmd, optval, sizeof(cmd)))
956                 return -EFAULT;
957
958         if (sin->sin_family != AF_INET)
959                 return -EINVAL;
960
961         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
962                 if (!tcp_sk(sk)->md5sig_info)
963                         return -ENOENT;
964                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
965         }
966
967         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
968                 return -EINVAL;
969
970         if (!tcp_sk(sk)->md5sig_info) {
971                 struct tcp_sock *tp = tcp_sk(sk);
972                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
973
974                 if (!p)
975                         return -EINVAL;
976
977                 tp->md5sig_info = p;
978                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
979         }
980
981         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
982         if (!newkey)
983                 return -ENOMEM;
984         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
985                                  newkey, cmd.tcpm_keylen);
986 }
987
988 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
989                                         __be32 daddr, __be32 saddr, int nbytes)
990 {
991         struct tcp4_pseudohdr *bp;
992         struct scatterlist sg;
993
994         bp = &hp->md5_blk.ip4;
995
996         /*
997          * 1. the TCP pseudo-header (in the order: source IP address,
998          * destination IP address, zero-padded protocol number, and
999          * segment length)
1000          */
1001         bp->saddr = saddr;
1002         bp->daddr = daddr;
1003         bp->pad = 0;
1004         bp->protocol = IPPROTO_TCP;
1005         bp->len = cpu_to_be16(nbytes);
1006
1007         sg_init_one(&sg, bp, sizeof(*bp));
1008         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1009 }
1010
1011 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1012                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1013 {
1014         struct tcp_md5sig_pool *hp;
1015         struct hash_desc *desc;
1016
1017         hp = tcp_get_md5sig_pool();
1018         if (!hp)
1019                 goto clear_hash_noput;
1020         desc = &hp->md5_desc;
1021
1022         if (crypto_hash_init(desc))
1023                 goto clear_hash;
1024         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1025                 goto clear_hash;
1026         if (tcp_md5_hash_header(hp, th))
1027                 goto clear_hash;
1028         if (tcp_md5_hash_key(hp, key))
1029                 goto clear_hash;
1030         if (crypto_hash_final(desc, md5_hash))
1031                 goto clear_hash;
1032
1033         tcp_put_md5sig_pool();
1034         return 0;
1035
1036 clear_hash:
1037         tcp_put_md5sig_pool();
1038 clear_hash_noput:
1039         memset(md5_hash, 0, 16);
1040         return 1;
1041 }
1042
1043 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1044                         struct sock *sk, struct request_sock *req,
1045                         struct sk_buff *skb)
1046 {
1047         struct tcp_md5sig_pool *hp;
1048         struct hash_desc *desc;
1049         struct tcphdr *th = tcp_hdr(skb);
1050         __be32 saddr, daddr;
1051
1052         if (sk) {
1053                 saddr = inet_sk(sk)->saddr;
1054                 daddr = inet_sk(sk)->daddr;
1055         } else if (req) {
1056                 saddr = inet_rsk(req)->loc_addr;
1057                 daddr = inet_rsk(req)->rmt_addr;
1058         } else {
1059                 const struct iphdr *iph = ip_hdr(skb);
1060                 saddr = iph->saddr;
1061                 daddr = iph->daddr;
1062         }
1063
1064         hp = tcp_get_md5sig_pool();
1065         if (!hp)
1066                 goto clear_hash_noput;
1067         desc = &hp->md5_desc;
1068
1069         if (crypto_hash_init(desc))
1070                 goto clear_hash;
1071
1072         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1073                 goto clear_hash;
1074         if (tcp_md5_hash_header(hp, th))
1075                 goto clear_hash;
1076         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1077                 goto clear_hash;
1078         if (tcp_md5_hash_key(hp, key))
1079                 goto clear_hash;
1080         if (crypto_hash_final(desc, md5_hash))
1081                 goto clear_hash;
1082
1083         tcp_put_md5sig_pool();
1084         return 0;
1085
1086 clear_hash:
1087         tcp_put_md5sig_pool();
1088 clear_hash_noput:
1089         memset(md5_hash, 0, 16);
1090         return 1;
1091 }
1092
1093 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1094
1095 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1096 {
1097         /*
1098          * This gets called for each TCP segment that arrives
1099          * so we want to be efficient.
1100          * We have 3 drop cases:
1101          * o No MD5 hash and one expected.
1102          * o MD5 hash and we're not expecting one.
1103          * o MD5 hash and its wrong.
1104          * o MD5 hash and it's wrong.
1105         __u8 *hash_location = NULL;
1106         struct tcp_md5sig_key *hash_expected;
1107         const struct iphdr *iph = ip_hdr(skb);
1108         struct tcphdr *th = tcp_hdr(skb);
1109         int genhash;
1110         unsigned char newhash[16];
1111
1112         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1113         hash_location = tcp_parse_md5sig_option(th);
1114
1115         /* We've parsed the options - do we have a hash? */
1116         if (!hash_expected && !hash_location)
1117                 return 0;
1118
1119         if (hash_expected && !hash_location) {
1120                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1121                 return 1;
1122         }
1123
1124         if (!hash_expected && hash_location) {
1125                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1126                 return 1;
1127         }
1128
1129         /* Okay, so this is hash_expected and hash_location -
1130          * so we need to calculate the checksum.
1131          */
1132         genhash = tcp_v4_md5_hash_skb(newhash,
1133                                       hash_expected,
1134                                       NULL, NULL, skb);
1135
1136         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1137                 if (net_ratelimit()) {
1138                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1139                                &iph->saddr, ntohs(th->source),
1140                                &iph->daddr, ntohs(th->dest),
1141                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1142                 }
1143                 return 1;
1144         }
1145         return 0;
1146 }
1147
1148 #endif
1149
1150 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1151         .family         =       PF_INET,
1152         .obj_size       =       sizeof(struct tcp_request_sock),
1153         .rtx_syn_ack    =       tcp_v4_send_synack,
1154         .send_ack       =       tcp_v4_reqsk_send_ack,
1155         .destructor     =       tcp_v4_reqsk_destructor,
1156         .send_reset     =       tcp_v4_send_reset,
1157 };
1158
1159 #ifdef CONFIG_TCP_MD5SIG
1160 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1161         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1162 };
1163 #endif
1164
1165 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1166         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1167         .twsk_unique    = tcp_twsk_unique,
1168         .twsk_destructor= tcp_twsk_destructor,
1169 };
1170
1171 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1172 {
1173         struct inet_request_sock *ireq;
1174         struct tcp_options_received tmp_opt;
1175         struct request_sock *req;
1176         __be32 saddr = ip_hdr(skb)->saddr;
1177         __be32 daddr = ip_hdr(skb)->daddr;
1178         __u32 isn = TCP_SKB_CB(skb)->when;
1179         struct dst_entry *dst = NULL;
1180 #ifdef CONFIG_SYN_COOKIES
1181         int want_cookie = 0;
1182 #else
1183 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1184 #endif
1185
1186         /* Never answer SYNs sent to broadcast or multicast */
1187         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1188                 goto drop;
1189
1190         /* TW buckets are converted to open requests without
1191          * limitation; they conserve resources and the peer is
1192          * evidently a real one.
1193          */
1194         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1195 #ifdef CONFIG_SYN_COOKIES
1196                 if (sysctl_tcp_syncookies) {
1197                         want_cookie = 1;
1198                 } else
1199 #endif
1200                 goto drop;
1201         }
1202
1203         /* Accept backlog is full. If we have already queued enough
1204          * warm entries in the syn queue, drop the request. It is better than
1205          * clogging the syn queue with openreqs with exponentially increasing
1206          * timeouts.
1207          */
1208         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1209                 goto drop;
1210
1211         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1212         if (!req)
1213                 goto drop;
1214
1215 #ifdef CONFIG_TCP_MD5SIG
1216         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1217 #endif
1218
1219         tcp_clear_options(&tmp_opt);
1220         tmp_opt.mss_clamp = 536;
1221         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1222
1223         tcp_parse_options(skb, &tmp_opt, 0);
1224
1225         if (want_cookie && !tmp_opt.saw_tstamp)
1226                 tcp_clear_options(&tmp_opt);
1227
1228         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1229                 /* Some OSes (unknown ones, but I see them on a web server which
1230                  * contains information interesting only for windows
1231                  * users) do not send their timestamp in the SYN. It is the easy
1232                  * case: we simply do not advertise TS support.
1233                  */
1234                 tmp_opt.saw_tstamp = 0;
1235                 tmp_opt.tstamp_ok  = 0;
1236         }
1237         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1238
1239         tcp_openreq_init(req, &tmp_opt, skb);
1240
1241         if (security_inet_conn_request(sk, skb, req))
1242                 goto drop_and_free;
1243
1244         ireq = inet_rsk(req);
1245         ireq->loc_addr = daddr;
1246         ireq->rmt_addr = saddr;
1247         ireq->no_srccheck = inet_sk(sk)->transparent;
1248         ireq->opt = tcp_v4_save_options(sk, skb);
1249         if (!want_cookie)
1250                 TCP_ECN_create_request(req, tcp_hdr(skb));
1251
1252         if (want_cookie) {
1253 #ifdef CONFIG_SYN_COOKIES
1254                 syn_flood_warning(skb);
1255                 req->cookie_ts = tmp_opt.tstamp_ok;
1256 #endif
1257                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
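                /*
                 * With syncookies the request is not queued: the ISN
                 * returned here encodes a hash of the connection tuple, an
                 * MSS table index and a coarse timestamp, and
                 * cookie_v4_check() reconstructs the request later when the
                 * final ACK echoes the cookie back.
                 */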
1258         } else if (!isn) {
1259                 struct inet_peer *peer = NULL;
1260
1261                 /* VJ's idea. We save the last timestamp seen
1262                  * from the destination in the peer table when entering
1263                  * state TIME-WAIT, and check against it before
1264                  * accepting a new connection request.
1265                  *
1266                  * If "isn" is not zero, this request hit an alive
1267                  * timewait bucket, so all the necessary checks
1268                  * are made in the function processing timewait state.
1269                  */
1270                 if (tmp_opt.saw_tstamp &&
1271                     tcp_death_row.sysctl_tw_recycle &&
1272                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1273                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1274                     peer->v4daddr == saddr) {
1275                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1276                             (s32)(peer->tcp_ts - req->ts_recent) >
1277                                                         TCP_PAWS_WINDOW) {
1278                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1279                                 goto drop_and_release;
1280                         }
1281                 }
1282                 /* Kill the following clause, if you dislike this way. */
1283                 else if (!sysctl_tcp_syncookies &&
1284                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1285                           (sysctl_max_syn_backlog >> 2)) &&
1286                          (!peer || !peer->tcp_ts_stamp) &&
1287                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1288                         /* Without syncookies the last quarter of
1289                          * the backlog is filled only with destinations
1290                          * proven to be alive.
1291                          * It means that we continue to communicate
1292                          * with destinations already remembered
1293                          * at the moment of the synflood.
1294                          */
1295                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1296                                        &saddr, ntohs(tcp_hdr(skb)->source));
1297                         goto drop_and_release;
1298                 }
1299
1300                 isn = tcp_v4_init_sequence(skb);
1301         }
1302         tcp_rsk(req)->snt_isn = isn;
1303
1304         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1305                 goto drop_and_free;
1306
1307         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1308         return 0;
1309
1310 drop_and_release:
1311         dst_release(dst);
1312 drop_and_free:
1313         reqsk_free(req);
1314 drop:
1315         return 0;
1316 }
1317
1318
1319 /*
1320  * The three way handshake has completed - we got a valid ACK to our
1321  * SYN-ACK - now create the new socket.
1322  */
1323 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1324                                   struct request_sock *req,
1325                                   struct dst_entry *dst)
1326 {
1327         struct inet_request_sock *ireq;
1328         struct inet_sock *newinet;
1329         struct tcp_sock *newtp;
1330         struct sock *newsk;
1331 #ifdef CONFIG_TCP_MD5SIG
1332         struct tcp_md5sig_key *key;
1333 #endif
1334
1335         if (sk_acceptq_is_full(sk))
1336                 goto exit_overflow;
1337
1338         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1339                 goto exit;
1340
1341         newsk = tcp_create_openreq_child(sk, req, skb);
1342         if (!newsk)
1343                 goto exit;
1344
1345         newsk->sk_gso_type = SKB_GSO_TCPV4;
1346         sk_setup_caps(newsk, dst);
1347
1348         newtp                 = tcp_sk(newsk);
1349         newinet               = inet_sk(newsk);
1350         ireq                  = inet_rsk(req);
1351         newinet->daddr        = ireq->rmt_addr;
1352         newinet->rcv_saddr    = ireq->loc_addr;
1353         newinet->saddr        = ireq->loc_addr;
1354         newinet->opt          = ireq->opt;
1355         ireq->opt             = NULL;
1356         newinet->mc_index     = inet_iif(skb);
1357         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1358         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1359         if (newinet->opt)
1360                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1361         newinet->id = newtp->write_seq ^ jiffies;
1362
1363         tcp_mtup_init(newsk);
1364         tcp_sync_mss(newsk, dst_mtu(dst));
1365         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1366         if (tcp_sk(sk)->rx_opt.user_mss &&
1367             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1368                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1369
1370         tcp_initialize_rcv_mss(newsk);
1371
1372 #ifdef CONFIG_TCP_MD5SIG
1373         /* Copy over the MD5 key from the original socket */
1374         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1375                 /*
1376                  * We're using one, so create a matching key
1377                  * on the newsk structure. If we fail to get
1378                  * memory, then we end up not copying the key
1379                  * across. Shucks.
1380                  */
1381                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1382                 if (newkey != NULL)
1383                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1384                                           newkey, key->keylen);
1385                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1386         }
1387 #endif
1388
1389         __inet_hash_nolisten(newsk);
1390         __inet_inherit_port(sk, newsk);
1391
1392         return newsk;
1393
1394 exit_overflow:
1395         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1396 exit:
1397         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1398         dst_release(dst);
1399         return NULL;
1400 }
1401
1402 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1403 {
1404         struct tcphdr *th = tcp_hdr(skb);
1405         const struct iphdr *iph = ip_hdr(skb);
1406         struct sock *nsk;
1407         struct request_sock **prev;
1408         /* Find possible connection requests. */
1409         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1410                                                        iph->saddr, iph->daddr);
1411         if (req)
1412                 return tcp_check_req(sk, skb, req, prev);
1413
1414         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1415                         th->source, iph->daddr, th->dest, inet_iif(skb));
1416
1417         if (nsk) {
1418                 if (nsk->sk_state != TCP_TIME_WAIT) {
1419                         bh_lock_sock(nsk);
1420                         return nsk;
1421                 }
1422                 inet_twsk_put(inet_twsk(nsk));
1423                 return NULL;
1424         }
1425
1426 #ifdef CONFIG_SYN_COOKIES
1427         if (!th->rst && !th->syn && th->ack)
1428                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1429 #endif
1430         return sk;
1431 }
1432
1433 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1434 {
1435         const struct iphdr *iph = ip_hdr(skb);
1436
1437         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1438                 if (!tcp_v4_check(skb->len, iph->saddr,
1439                                   iph->daddr, skb->csum)) {
1440                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1441                         return 0;
1442                 }
1443         }
1444
1445         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1446                                        skb->len, IPPROTO_TCP, 0);
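        /*
         * No checksum has been verified yet: seed skb->csum with the
         * pseudo-header sum so a later tcp_checksum_complete() only has to
         * add in the payload.  Short packets (<= 76 bytes) are cheap enough
         * to verify immediately.
         */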
1447
1448         if (skb->len <= 76) {
1449                 return __skb_checksum_complete(skb);
1450         }
1451         return 0;
1452 }
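/*
 * Added summary of the checksum strategy used above:
 *
 *   - CHECKSUM_COMPLETE: the device already summed the packet, so only
 *     the pseudo-header has to be folded in via tcp_v4_check(); on
 *     success the skb is marked CHECKSUM_UNNECESSARY.
 *   - Otherwise: seed skb->csum with the pseudo-header sum (saddr,
 *     daddr, length, IPPROTO_TCP) and defer full verification, except
 *     for short packets (<= 76 bytes) where deferring buys nothing and
 *     __skb_checksum_complete() is called right away.
 */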
1453
1454
1455 /* The socket must have its spinlock held when we get
1456  * here.
1457  *
1458  * We have a potential double-lock case here, so even when
1459  * doing backlog processing we use the BH locking scheme.
1460  * This is because we cannot sleep with the original spinlock
1461  * held.
1462  */
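/*
 * Added illustration: a simplified sketch of how tcp_v4_rcv() below
 * invokes this function from softirq context (prequeue and NET_DMA
 * handling omitted):
 *
 *      bh_lock_sock_nested(sk);
 *      if (!sock_owned_by_user(sk))
 *              ret = tcp_v4_do_rcv(sk, skb);
 *      else
 *              sk_add_backlog(sk, skb);
 *      bh_unlock_sock(sk);
 */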
1463 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1464 {
1465         struct sock *rsk;
1466 #ifdef CONFIG_TCP_MD5SIG
1467         /*
1468          * We really want to reject the packet as early as possible
1469          * if:
1470          *  o We're expecting an MD5-signed packet and there is no MD5 TCP option
1471          *  o There is an MD5 option and we're not expecting one
1472          */
1473         if (tcp_v4_inbound_md5_hash(sk, skb))
1474                 goto discard;
1475 #endif
1476
1477         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1478                 TCP_CHECK_TIMER(sk);
1479                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1480                         rsk = sk;
1481                         goto reset;
1482                 }
1483                 TCP_CHECK_TIMER(sk);
1484                 return 0;
1485         }
1486
1487         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1488                 goto csum_err;
1489
1490         if (sk->sk_state == TCP_LISTEN) {
1491                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1492                 if (!nsk)
1493                         goto discard;
1494
1495                 if (nsk != sk) {
1496                         if (tcp_child_process(sk, nsk, skb)) {
1497                                 rsk = nsk;
1498                                 goto reset;
1499                         }
1500                         return 0;
1501                 }
1502         }
1503
1504         TCP_CHECK_TIMER(sk);
1505         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1506                 rsk = sk;
1507                 goto reset;
1508         }
1509         TCP_CHECK_TIMER(sk);
1510         return 0;
1511
1512 reset:
1513         tcp_v4_send_reset(rsk, skb);
1514 discard:
1515         kfree_skb(skb);
1516         /* Be careful here. If this function gets more complicated and
1517          * gcc suffers from register pressure on the x86, sk (in %ebx)
1518          * might be destroyed here. This current version compiles correctly,
1519          * but you have been warned.
1520          */
1521         return 0;
1522
1523 csum_err:
1524         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1525         goto discard;
1526 }
1527
1528 /*
1529  *      From tcp_input.c
1530  */
1531
1532 int tcp_v4_rcv(struct sk_buff *skb)
1533 {
1534         const struct iphdr *iph;
1535         struct tcphdr *th;
1536         struct sock *sk;
1537         int ret;
1538         struct net *net = dev_net(skb->dev);
1539
1540         if (skb->pkt_type != PACKET_HOST)
1541                 goto discard_it;
1542
1543         /* Count it even if it's bad */
1544         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1545
1546         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1547                 goto discard_it;
1548
1549         th = tcp_hdr(skb);
1550
1551         if (th->doff < sizeof(struct tcphdr) / 4)
1552                 goto bad_packet;
1553         if (!pskb_may_pull(skb, th->doff * 4))
1554                 goto discard_it;
1555
1556         /* An explanation is required here, I think.
1557          * Packet length and doff are validated by header prediction,
1558          * provided the th->doff == 0 case has already been eliminated.
1559          * So, we defer the checks. */
1560         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1561                 goto bad_packet;
1562
1563         th = tcp_hdr(skb);
1564         iph = ip_hdr(skb);
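        /*
         * Added note: record the segment's sequence range in the skb
         * control block.  SYN and FIN each occupy one unit of sequence
         * space, which is why they are added to the payload length when
         * computing end_seq.
         */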
1565         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1566         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1567                                     skb->len - th->doff * 4);
1568         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1569         TCP_SKB_CB(skb)->when    = 0;
1570         TCP_SKB_CB(skb)->flags   = iph->tos;
1571         TCP_SKB_CB(skb)->sacked  = 0;
1572
1573         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1574         if (!sk)
1575                 goto no_tcp_socket;
1576
1577 process:
1578         if (sk->sk_state == TCP_TIME_WAIT)
1579                 goto do_time_wait;
1580
1581         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1582                 goto discard_and_relse;
1583         nf_reset(skb);
1584
1585         if (sk_filter(sk, skb))
1586                 goto discard_and_relse;
1587
1588         skb->dev = NULL;
1589
1590         bh_lock_sock_nested(sk);
1591         ret = 0;
1592         if (!sock_owned_by_user(sk)) {
1593 #ifdef CONFIG_NET_DMA
1594                 struct tcp_sock *tp = tcp_sk(sk);
1595                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1596                         tp->ucopy.dma_chan = get_softnet_dma();
1597                 if (tp->ucopy.dma_chan)
1598                         ret = tcp_v4_do_rcv(sk, skb);
1599                 else
1600 #endif
1601                 {
1602                         if (!tcp_prequeue(sk, skb))
1603                                 ret = tcp_v4_do_rcv(sk, skb);
1604                 }
1605         } else
1606                 sk_add_backlog(sk, skb);
1607         bh_unlock_sock(sk);
1608
1609         sock_put(sk);
1610
1611         return ret;
1612
1613 no_tcp_socket:
1614         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1615                 goto discard_it;
1616
1617         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1618 bad_packet:
1619                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1620         } else {
1621                 tcp_v4_send_reset(NULL, skb);
1622         }
1623
1624 discard_it:
1625         /* Discard frame. */
1626         kfree_skb(skb);
1627         return 0;
1628
1629 discard_and_relse:
1630         sock_put(sk);
1631         goto discard_it;
1632
1633 do_time_wait:
1634         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1635                 inet_twsk_put(inet_twsk(sk));
1636                 goto discard_it;
1637         }
1638
1639         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1640                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1641                 inet_twsk_put(inet_twsk(sk));
1642                 goto discard_it;
1643         }
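        /*
         * Added note: tcp_timewait_state_process() classifies the
         * segment for us.  A fresh SYN may be handed over to a current
         * listener (TCP_TW_SYN); otherwise we re-ACK, answer with a
         * reset, or silently drop it.
         */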
1644         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1645         case TCP_TW_SYN: {
1646                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1647                                                         &tcp_hashinfo,
1648                                                         iph->daddr, th->dest,
1649                                                         inet_iif(skb));
1650                 if (sk2) {
1651                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1652                         inet_twsk_put(inet_twsk(sk));
1653                         sk = sk2;
1654                         goto process;
1655                 }
1656                 /* Fall through to ACK */
1657         }
1658         case TCP_TW_ACK:
1659                 tcp_v4_timewait_ack(sk, skb);
1660                 break;
1661         case TCP_TW_RST:
1662                 goto no_tcp_socket;
1663         case TCP_TW_SUCCESS:;
1664         }
1665         goto discard_it;
1666 }
1667
1668 /* VJ's idea. Save the last timestamp seen from this destination and
1669  * hold it for at least the normal timewait interval, so it can be used
1670  * for duplicate segment detection in subsequent connections before they
1671  * enter the synchronized state.
1672  */
1673
1674 int tcp_v4_remember_stamp(struct sock *sk)
1675 {
1676         struct inet_sock *inet = inet_sk(sk);
1677         struct tcp_sock *tp = tcp_sk(sk);
1678         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1679         struct inet_peer *peer = NULL;
1680         int release_it = 0;
1681
1682         if (!rt || rt->rt_dst != inet->daddr) {
1683                 peer = inet_getpeer(inet->daddr, 1);
1684                 release_it = 1;
1685         } else {
1686                 if (!rt->peer)
1687                         rt_bind_peer(rt, 1);
1688                 peer = rt->peer;
1689         }
1690
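        /*
         * Added note: refresh the peer's cached timestamp only if ours
         * is at least as recent, or if the cached entry is older than
         * TCP_PAWS_MSL and has not been updated by a newer connection.
         * Subsequent connections from this peer rely on the cached value
         * for PAWS-style duplicate detection before reaching the
         * synchronized state.
         */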
1691         if (peer) {
1692                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1693                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1694                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1695                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1696                         peer->tcp_ts = tp->rx_opt.ts_recent;
1697                 }
1698                 if (release_it)
1699                         inet_putpeer(peer);
1700                 return 1;
1701         }
1702
1703         return 0;
1704 }
1705
1706 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1707 {
1708         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1709
1710         if (peer) {
1711                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1712
1713                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1714                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1715                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1716                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1717                         peer->tcp_ts       = tcptw->tw_ts_recent;
1718                 }
1719                 inet_putpeer(peer);
1720                 return 1;
1721         }
1722
1723         return 0;
1724 }
1725
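/*
 * Added note: these are the AF_INET-specific connection-socket
 * operations.  The protocol-independent TCP code reaches them through
 * icsk->icsk_af_ops, which tcp_v4_init_sock() below points at this
 * table.
 */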
1726 struct inet_connection_sock_af_ops ipv4_specific = {
1727         .queue_xmit        = ip_queue_xmit,
1728         .send_check        = tcp_v4_send_check,
1729         .rebuild_header    = inet_sk_rebuild_header,
1730         .conn_request      = tcp_v4_conn_request,
1731         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1732         .remember_stamp    = tcp_v4_remember_stamp,
1733         .net_header_len    = sizeof(struct iphdr),
1734         .setsockopt        = ip_setsockopt,
1735         .getsockopt        = ip_getsockopt,
1736         .addr2sockaddr     = inet_csk_addr2sockaddr,
1737         .sockaddr_len      = sizeof(struct sockaddr_in),
1738         .bind_conflict     = inet_csk_bind_conflict,
1739 #ifdef CONFIG_COMPAT
1740         .compat_setsockopt = compat_ip_setsockopt,
1741         .compat_getsockopt = compat_ip_getsockopt,
1742 #endif
1743 };
1744
1745 #ifdef CONFIG_TCP_MD5SIG
1746 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1747         .md5_lookup             = tcp_v4_md5_lookup,
1748         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1749         .md5_add                = tcp_v4_md5_add_func,
1750         .md5_parse              = tcp_v4_parse_md5_keys,
1751 };
1752 #endif
1753
1754 /* NOTE: A lot of things are set to zero explicitly by the call to
1755  *       sk_alloc(), so they need not be done here.
1756  */
1757 static int tcp_v4_init_sock(struct sock *sk)
1758 {
1759         struct inet_connection_sock *icsk = inet_csk(sk);
1760         struct tcp_sock *tp = tcp_sk(sk);
1761
1762         skb_queue_head_init(&tp->out_of_order_queue);
1763         tcp_init_xmit_timers(sk);
1764         tcp_prequeue_init(tp);
1765
1766         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1767         tp->mdev = TCP_TIMEOUT_INIT;
1768
1769         /* So many TCP implementations out there (incorrectly) count the
1770          * initial SYN frame in their delayed-ACK and congestion control
1771          * algorithms that we must have the following bandaid to talk
1772          * efficiently to them.  -DaveM
1773          */
1774         tp->snd_cwnd = 2;
1775
1776         /* See draft-stevens-tcpca-spec-01 for discussion of the
1777          * initialization of these values.
1778          */
1779         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1780         tp->snd_cwnd_clamp = ~0;
1781         tp->mss_cache = 536;
1782
1783         tp->reordering = sysctl_tcp_reordering;
1784         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1785
1786         sk->sk_state = TCP_CLOSE;
1787
1788         sk->sk_write_space = sk_stream_write_space;
1789         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1790
1791         icsk->icsk_af_ops = &ipv4_specific;
1792         icsk->icsk_sync_mss = tcp_sync_mss;
1793 #ifdef CONFIG_TCP_MD5SIG
1794         tp->af_specific = &tcp_sock_ipv4_specific;
1795 #endif
1796
1797         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1798         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1799
1800         percpu_counter_inc(&tcp_sockets_allocated);
1801
1802         return 0;
1803 }
1804
1805 void tcp_v4_destroy_sock(struct sock *sk)
1806 {
1807         struct tcp_sock *tp = tcp_sk(sk);
1808
1809         tcp_clear_xmit_timers(sk);
1810
1811         tcp_cleanup_congestion_control(sk);
1812
1813         /* Clean up the write buffer. */
1814         tcp_write_queue_purge(sk);
1815
1816         /* Cleans up our, hopefully empty, out_of_order_queue. */
1817         __skb_queue_purge(&tp->out_of_order_queue);
1818
1819 #ifdef CONFIG_TCP_MD5SIG
1820         /* Clean up the MD5 key list, if any */
1821         if (tp->md5sig_info) {
1822                 tcp_v4_clear_md5_list(sk);
1823                 kfree(tp->md5sig_info);
1824                 tp->md5sig_info = NULL;
1825         }
1826 #endif
1827
1828 #ifdef CONFIG_NET_DMA
1829         /* Cleans up our sk_async_wait_queue */
1830         __skb_queue_purge(&sk->sk_async_wait_queue);
1831 #endif
1832
1833         /* Clean up the prequeue; it should already be empty. */
1834         __skb_queue_purge(&tp->ucopy.prequeue);
1835
1836         /* Clean up a referenced TCP bind bucket. */
1837         if (inet_csk(sk)->icsk_bind_hash)
1838                 inet_put_port(sk);
1839
1840         /*
1841          * If sendmsg cached page exists, toss it.
1842          */
1843         if (sk->sk_sndmsg_page) {
1844                 __free_page(sk->sk_sndmsg_page);
1845                 sk->sk_sndmsg_page = NULL;
1846         }
1847
1848         percpu_counter_dec(&tcp_sockets_allocated);
1849 }
1850
1851 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1852
1853 #ifdef CONFIG_PROC_FS
1854 /* Proc filesystem TCP sock list dumping. */
1855
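/*
 * Added note: the established and time-wait chains are RCU "nulls"
 * lists, so the list terminator is an encoded marker rather than plain
 * NULL (tcp_prot is created with SLAB_DESTROY_BY_RCU below).  That is
 * why the helpers here use the hlist_nulls accessors instead of NULL
 * tests.
 */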
1856 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1857 {
1858         return hlist_nulls_empty(head) ? NULL :
1859                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1860 }
1861
1862 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1863 {
1864         return !is_a_nulls(tw->tw_node.next) ?
1865                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1866 }
1867
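/*
 * Added note on the seq_file walk order: listening sockets (and, per
 * listener, the pending requests in its syn table) come first, then
 * each established-hash bucket's chain followed by its time-wait chain.
 * st->state tracks which phase the iterator is in.
 */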
1868 static void *listening_get_next(struct seq_file *seq, void *cur)
1869 {
1870         struct inet_connection_sock *icsk;
1871         struct hlist_nulls_node *node;
1872         struct sock *sk = cur;
1873         struct inet_listen_hashbucket *ilb;
1874         struct tcp_iter_state *st = seq->private;
1875         struct net *net = seq_file_net(seq);
1876
1877         if (!sk) {
1878                 st->bucket = 0;
1879                 ilb = &tcp_hashinfo.listening_hash[0];
1880                 spin_lock_bh(&ilb->lock);
1881                 sk = sk_nulls_head(&ilb->head);
1882                 goto get_sk;
1883         }
1884         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1885         ++st->num;
1886
1887         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1888                 struct request_sock *req = cur;
1889
1890                 icsk = inet_csk(st->syn_wait_sk);
1891                 req = req->dl_next;
1892                 while (1) {
1893                         while (req) {
1894                                 if (req->rsk_ops->family == st->family) {
1895                                         cur = req;
1896                                         goto out;
1897                                 }
1898                                 req = req->dl_next;
1899                         }
1900                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1901                                 break;
1902 get_req:
1903                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1904                 }
1905                 sk        = sk_next(st->syn_wait_sk);
1906                 st->state = TCP_SEQ_STATE_LISTENING;
1907                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1908         } else {
1909                 icsk = inet_csk(sk);
1910                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1911                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1912                         goto start_req;
1913                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1914                 sk = sk_next(sk);
1915         }
1916 get_sk:
1917         sk_nulls_for_each_from(sk, node) {
1918                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1919                         cur = sk;
1920                         goto out;
1921                 }
1922                 icsk = inet_csk(sk);
1923                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1924                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1925 start_req:
1926                         st->uid         = sock_i_uid(sk);
1927                         st->syn_wait_sk = sk;
1928                         st->state       = TCP_SEQ_STATE_OPENREQ;
1929                         st->sbucket     = 0;
1930                         goto get_req;
1931                 }
1932                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1933         }
1934         spin_unlock_bh(&ilb->lock);
1935         if (++st->bucket < INET_LHTABLE_SIZE) {
1936                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1937                 spin_lock_bh(&ilb->lock);
1938                 sk = sk_nulls_head(&ilb->head);
1939                 goto get_sk;
1940         }
1941         cur = NULL;
1942 out:
1943         return cur;
1944 }
1945
1946 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1947 {
1948         void *rc = listening_get_next(seq, NULL);
1949
1950         while (rc && *pos) {
1951                 rc = listening_get_next(seq, rc);
1952                 --*pos;
1953         }
1954         return rc;
1955 }
1956
1957 static inline int empty_bucket(struct tcp_iter_state *st)
1958 {
1959         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1960                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1961 }
1962
1963 static void *established_get_first(struct seq_file *seq)
1964 {
1965         struct tcp_iter_state *st = seq->private;
1966         struct net *net = seq_file_net(seq);
1967         void *rc = NULL;
1968
1969         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1970                 struct sock *sk;
1971                 struct hlist_nulls_node *node;
1972                 struct inet_timewait_sock *tw;
1973                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1974
1975                 /* Lockless fast path for the common case of empty buckets */
1976                 if (empty_bucket(st))
1977                         continue;
1978
1979                 spin_lock_bh(lock);
1980                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1981                         if (sk->sk_family != st->family ||
1982                             !net_eq(sock_net(sk), net)) {
1983                                 continue;
1984                         }
1985                         rc = sk;
1986                         goto out;
1987                 }
1988                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1989                 inet_twsk_for_each(tw, node,
1990                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
1991                         if (tw->tw_family != st->family ||
1992                             !net_eq(twsk_net(tw), net)) {
1993                                 continue;
1994                         }
1995                         rc = tw;
1996                         goto out;
1997                 }
1998                 spin_unlock_bh(lock);
1999                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2000         }
2001 out:
2002         return rc;
2003 }
2004
2005 static void *established_get_next(struct seq_file *seq, void *cur)
2006 {
2007         struct sock *sk = cur;
2008         struct inet_timewait_sock *tw;
2009         struct hlist_nulls_node *node;
2010         struct tcp_iter_state *st = seq->private;
2011         struct net *net = seq_file_net(seq);
2012
2013         ++st->num;
2014
2015         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2016                 tw = cur;
2017                 tw = tw_next(tw);
2018 get_tw:
2019                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2020                         tw = tw_next(tw);
2021                 }
2022                 if (tw) {
2023                         cur = tw;
2024                         goto out;
2025                 }
2026                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2027                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2028
2029                 /* Look for the next non-empty bucket */
2030                 while (++st->bucket < tcp_hashinfo.ehash_size &&
2031                                 empty_bucket(st))
2032                         ;
2033                 if (st->bucket >= tcp_hashinfo.ehash_size)
2034                         return NULL;
2035
2036                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2037                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2038         } else
2039                 sk = sk_nulls_next(sk);
2040
2041         sk_nulls_for_each_from(sk, node) {
2042                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2043                         goto found;
2044         }
2045
2046         st->state = TCP_SEQ_STATE_TIME_WAIT;
2047         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2048         goto get_tw;
2049 found:
2050         cur = sk;
2051 out:
2052         return cur;
2053 }
2054
2055 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2056 {
2057         void *rc = established_get_first(seq);
2058
2059         while (rc && pos) {
2060                 rc = established_get_next(seq, rc);
2061                 --pos;
2062         }
2063         return rc;
2064 }
2065
2066 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2067 {
2068         void *rc;
2069         struct tcp_iter_state *st = seq->private;
2070
2071         st->state = TCP_SEQ_STATE_LISTENING;
2072         rc        = listening_get_idx(seq, &pos);
2073
2074         if (!rc) {
2075                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2076                 rc        = established_get_idx(seq, pos);
2077         }
2078
2079         return rc;
2080 }
2081
2082 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2083 {
2084         struct tcp_iter_state *st = seq->private;
2085         st->state = TCP_SEQ_STATE_LISTENING;
2086         st->num = 0;
2087         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2088 }
2089
2090 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2091 {
2092         void *rc = NULL;
2093         struct tcp_iter_state *st;
2094
2095         if (v == SEQ_START_TOKEN) {
2096                 rc = tcp_get_idx(seq, 0);
2097                 goto out;
2098         }
2099         st = seq->private;
2100
2101         switch (st->state) {
2102         case TCP_SEQ_STATE_OPENREQ:
2103         case TCP_SEQ_STATE_LISTENING:
2104                 rc = listening_get_next(seq, v);
2105                 if (!rc) {
2106                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2107                         rc        = established_get_first(seq);
2108                 }
2109                 break;
2110         case TCP_SEQ_STATE_ESTABLISHED:
2111         case TCP_SEQ_STATE_TIME_WAIT:
2112                 rc = established_get_next(seq, v);
2113                 break;
2114         }
2115 out:
2116         ++*pos;
2117         return rc;
2118 }
2119
2120 static void tcp_seq_stop(struct seq_file *seq, void *v)
2121 {
2122         struct tcp_iter_state *st = seq->private;
2123
2124         switch (st->state) {
2125         case TCP_SEQ_STATE_OPENREQ:
2126                 if (v) {
2127                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2128                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2129                 }
2130         case TCP_SEQ_STATE_LISTENING:
2131                 if (v != SEQ_START_TOKEN)
2132                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2133                 break;
2134         case TCP_SEQ_STATE_TIME_WAIT:
2135         case TCP_SEQ_STATE_ESTABLISHED:
2136                 if (v)
2137                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2138                 break;
2139         }
2140 }
2141
2142 static int tcp_seq_open(struct inode *inode, struct file *file)
2143 {
2144         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2145         struct tcp_iter_state *s;
2146         int err;
2147
2148         err = seq_open_net(inode, file, &afinfo->seq_ops,
2149                           sizeof(struct tcp_iter_state));
2150         if (err < 0)
2151                 return err;
2152
2153         s = ((struct seq_file *)file->private_data)->private;
2154         s->family               = afinfo->family;
2155         return 0;
2156 }
2157
2158 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2159 {
2160         int rc = 0;
2161         struct proc_dir_entry *p;
2162
2163         afinfo->seq_fops.open           = tcp_seq_open;
2164         afinfo->seq_fops.read           = seq_read;
2165         afinfo->seq_fops.llseek         = seq_lseek;
2166         afinfo->seq_fops.release        = seq_release_net;
2167
2168         afinfo->seq_ops.start           = tcp_seq_start;
2169         afinfo->seq_ops.next            = tcp_seq_next;
2170         afinfo->seq_ops.stop            = tcp_seq_stop;
2171
2172         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2173                              &afinfo->seq_fops, afinfo);
2174         if (!p)
2175                 rc = -ENOMEM;
2176         return rc;
2177 }
2178
2179 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2180 {
2181         proc_net_remove(net, afinfo->name);
2182 }
2183
2184 static void get_openreq4(struct sock *sk, struct request_sock *req,
2185                          struct seq_file *f, int i, int uid, int *len)
2186 {
2187         const struct inet_request_sock *ireq = inet_rsk(req);
2188         int ttd = req->expires - jiffies;
2189
2190         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2191                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2192                 i,
2193                 ireq->loc_addr,
2194                 ntohs(inet_sk(sk)->sport),
2195                 ireq->rmt_addr,
2196                 ntohs(ireq->rmt_port),
2197                 TCP_SYN_RECV,
2198                 0, 0, /* could print option size, but that is af dependent. */
2199                 1,    /* timers active (only the expire timer) */
2200                 jiffies_to_clock_t(ttd),
2201                 req->retrans,
2202                 uid,
2203                 0,  /* non standard timer */
2204                 0, /* open_requests have no inode */
2205                 atomic_read(&sk->sk_refcnt),
2206                 req,
2207                 len);
2208 }
2209
2210 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2211 {
2212         int timer_active;
2213         unsigned long timer_expires;
2214         struct tcp_sock *tp = tcp_sk(sk);
2215         const struct inet_connection_sock *icsk = inet_csk(sk);
2216         struct inet_sock *inet = inet_sk(sk);
2217         __be32 dest = inet->daddr;
2218         __be32 src = inet->rcv_saddr;
2219         __u16 destp = ntohs(inet->dport);
2220         __u16 srcp = ntohs(inet->sport);
2221
2222         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2223                 timer_active    = 1;
2224                 timer_expires   = icsk->icsk_timeout;
2225         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2226                 timer_active    = 4;
2227                 timer_expires   = icsk->icsk_timeout;
2228         } else if (timer_pending(&sk->sk_timer)) {
2229                 timer_active    = 2;
2230                 timer_expires   = sk->sk_timer.expires;
2231         } else {
2232                 timer_active    = 0;
2233                 timer_expires = jiffies;
2234         }
2235
2236         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2237                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2238                 i, src, srcp, dest, destp, sk->sk_state,
2239                 tp->write_seq - tp->snd_una,
2240                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2241                                              (tp->rcv_nxt - tp->copied_seq),
2242                 timer_active,
2243                 jiffies_to_clock_t(timer_expires - jiffies),
2244                 icsk->icsk_retransmits,
2245                 sock_i_uid(sk),
2246                 icsk->icsk_probes_out,
2247                 sock_i_ino(sk),
2248                 atomic_read(&sk->sk_refcnt), sk,
2249                 jiffies_to_clock_t(icsk->icsk_rto),
2250                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2251                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2252                 tp->snd_cwnd,
2253                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2254                 len);
2255 }
2256
2257 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2258                                struct seq_file *f, int i, int *len)
2259 {
2260         __be32 dest, src;
2261         __u16 destp, srcp;
2262         int ttd = tw->tw_ttd - jiffies;
2263
2264         if (ttd < 0)
2265                 ttd = 0;
2266
2267         dest  = tw->tw_daddr;
2268         src   = tw->tw_rcv_saddr;
2269         destp = ntohs(tw->tw_dport);
2270         srcp  = ntohs(tw->tw_sport);
2271
2272         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2273                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2274                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2275                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2276                 atomic_read(&tw->tw_refcnt), tw, len);
2277 }
2278
2279 #define TMPSZ 150
2280
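/*
 * Added note: each /proc/net/tcp row produced below is padded out to
 * TMPSZ - 1 characters; addresses and ports are printed by the helpers
 * above as hexadecimal %08X:%04X pairs.
 */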
2281 static int tcp4_seq_show(struct seq_file *seq, void *v)
2282 {
2283         struct tcp_iter_state *st;
2284         int len;
2285
2286         if (v == SEQ_START_TOKEN) {
2287                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2288                            "  sl  local_address rem_address   st tx_queue "
2289                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2290                            "inode");
2291                 goto out;
2292         }
2293         st = seq->private;
2294
2295         switch (st->state) {
2296         case TCP_SEQ_STATE_LISTENING:
2297         case TCP_SEQ_STATE_ESTABLISHED:
2298                 get_tcp4_sock(v, seq, st->num, &len);
2299                 break;
2300         case TCP_SEQ_STATE_OPENREQ:
2301                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2302                 break;
2303         case TCP_SEQ_STATE_TIME_WAIT:
2304                 get_timewait4_sock(v, seq, st->num, &len);
2305                 break;
2306         }
2307         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2308 out:
2309         return 0;
2310 }
2311
2312 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2313         .name           = "tcp",
2314         .family         = AF_INET,
2315         .seq_fops       = {
2316                 .owner          = THIS_MODULE,
2317         },
2318         .seq_ops        = {
2319                 .show           = tcp4_seq_show,
2320         },
2321 };
2322
2323 static int tcp4_proc_init_net(struct net *net)
2324 {
2325         return tcp_proc_register(net, &tcp4_seq_afinfo);
2326 }
2327
2328 static void tcp4_proc_exit_net(struct net *net)
2329 {
2330         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2331 }
2332
2333 static struct pernet_operations tcp4_net_ops = {
2334         .init = tcp4_proc_init_net,
2335         .exit = tcp4_proc_exit_net,
2336 };
2337
2338 int __init tcp4_proc_init(void)
2339 {
2340         return register_pernet_subsys(&tcp4_net_ops);
2341 }
2342
2343 void tcp4_proc_exit(void)
2344 {
2345         unregister_pernet_subsys(&tcp4_net_ops);
2346 }
2347 #endif /* CONFIG_PROC_FS */
2348
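/*
 * Added note: the protocol descriptor for IPv4 TCP sockets.
 * .sockets_allocated points at the tcp_sockets_allocated percpu_counter
 * which tcp_v4_init_sock() and tcp_v4_destroy_sock() above increment
 * and decrement.
 */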
2349 struct proto tcp_prot = {
2350         .name                   = "TCP",
2351         .owner                  = THIS_MODULE,
2352         .close                  = tcp_close,
2353         .connect                = tcp_v4_connect,
2354         .disconnect             = tcp_disconnect,
2355         .accept                 = inet_csk_accept,
2356         .ioctl                  = tcp_ioctl,
2357         .init                   = tcp_v4_init_sock,
2358         .destroy                = tcp_v4_destroy_sock,
2359         .shutdown               = tcp_shutdown,
2360         .setsockopt             = tcp_setsockopt,
2361         .getsockopt             = tcp_getsockopt,
2362         .recvmsg                = tcp_recvmsg,
2363         .backlog_rcv            = tcp_v4_do_rcv,
2364         .hash                   = inet_hash,
2365         .unhash                 = inet_unhash,
2366         .get_port               = inet_csk_get_port,
2367         .enter_memory_pressure  = tcp_enter_memory_pressure,
2368         .sockets_allocated      = &tcp_sockets_allocated,
2369         .orphan_count           = &tcp_orphan_count,
2370         .memory_allocated       = &tcp_memory_allocated,
2371         .memory_pressure        = &tcp_memory_pressure,
2372         .sysctl_mem             = sysctl_tcp_mem,
2373         .sysctl_wmem            = sysctl_tcp_wmem,
2374         .sysctl_rmem            = sysctl_tcp_rmem,
2375         .max_header             = MAX_TCP_HEADER,
2376         .obj_size               = sizeof(struct tcp_sock),
2377         .slab_flags             = SLAB_DESTROY_BY_RCU,
2378         .twsk_prot              = &tcp_timewait_sock_ops,
2379         .rsk_prot               = &tcp_request_sock_ops,
2380         .h.hashinfo             = &tcp_hashinfo,
2381 #ifdef CONFIG_COMPAT
2382         .compat_setsockopt      = compat_tcp_setsockopt,
2383         .compat_getsockopt      = compat_tcp_getsockopt,
2384 #endif
2385 };
2386
2387
2388 static int __net_init tcp_sk_init(struct net *net)
2389 {
2390         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2391                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2392 }
2393
2394 static void __net_exit tcp_sk_exit(struct net *net)
2395 {
2396         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2397         inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2398 }
2399
2400 static struct pernet_operations __net_initdata tcp_sk_ops = {
2401        .init = tcp_sk_init,
2402        .exit = tcp_sk_exit,
2403 };
2404
2405 void __init tcp_v4_init(void)
2406 {
2407         inet_hashinfo_init(&tcp_hashinfo);
2408         if (register_pernet_device(&tcp_sk_ops))
2409                 panic("Failed to create the TCP control socket.\n");
2410 }
2411
2412 EXPORT_SYMBOL(ipv4_specific);
2413 EXPORT_SYMBOL(tcp_hashinfo);
2414 EXPORT_SYMBOL(tcp_prot);
2415 EXPORT_SYMBOL(tcp_v4_conn_request);
2416 EXPORT_SYMBOL(tcp_v4_connect);
2417 EXPORT_SYMBOL(tcp_v4_do_rcv);
2418 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2419 EXPORT_SYMBOL(tcp_v4_send_check);
2420 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2421
2422 #ifdef CONFIG_PROC_FS
2423 EXPORT_SYMBOL(tcp_proc_register);
2424 EXPORT_SYMBOL(tcp_proc_unregister);
2425 #endif
2426 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2427