inet{6}_request_sock: Init ->opt and ->pktopts in the constructor
[linux-flexiantxendom0-natty.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84
/*
 * Tunables.  sysctl_tcp_tw_reuse is consulted by tcp_twsk_unique() below
 * when deciding whether a TIME-WAIT bucket may be taken over by a new
 * connection.
 * NOTE(review): sysctl_tcp_low_latency is not read in this part of the
 * file; its consumer is presumably elsewhere - confirm.
 */
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Defined further down; declared early so earlier code can reference it. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
/*
 * Forward declarations of the MD5 signature helpers used by the reply
 * paths (tcp_v4_send_reset() / tcp_v4_send_ack()) before their definitions.
 */
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   unsigned int tcplen);
#endif

/*
 * The TCP socket hash tables.  Only the listening-hash lock, user count
 * and wait queue get static initializers here.
 */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
107
108 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
109 {
110         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
111                                           ip_hdr(skb)->saddr,
112                                           tcp_hdr(skb)->dest,
113                                           tcp_hdr(skb)->source);
114 }
115
116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117 {
118         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
119         struct tcp_sock *tp = tcp_sk(sk);
120
121         /* With PAWS, it is safe from the viewpoint
122            of data integrity. Even without PAWS it is safe provided sequence
123            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
124
125            Actually, the idea is close to VJ's one, only timestamp cache is
126            held not per host, but per port pair and TW bucket is used as state
127            holder.
128
129            If TW bucket has been already destroyed we fall back to VJ's scheme
130            and use initial timestamp retrieved from peer table.
131          */
132         if (tcptw->tw_ts_recent_stamp &&
133             (twp == NULL || (sysctl_tcp_tw_reuse &&
134                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
135                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
136                 if (tp->write_seq == 0)
137                         tp->write_seq = 1;
138                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
139                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
140                 sock_hold(sktw);
141                 return 1;
142         }
143
144         return 0;
145 }
146
147 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
148
149 /* This will initiate an outgoing connection. */
150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
151 {
152         struct inet_sock *inet = inet_sk(sk);
153         struct tcp_sock *tp = tcp_sk(sk);
154         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
155         struct rtable *rt;
156         __be32 daddr, nexthop;
157         int tmp;
158         int err;
159
160         if (addr_len < sizeof(struct sockaddr_in))
161                 return -EINVAL;
162
163         if (usin->sin_family != AF_INET)
164                 return -EAFNOSUPPORT;
165
166         nexthop = daddr = usin->sin_addr.s_addr;
167         if (inet->opt && inet->opt->srr) {
168                 if (!daddr)
169                         return -EINVAL;
170                 nexthop = inet->opt->faddr;
171         }
172
173         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
174                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175                                IPPROTO_TCP,
176                                inet->sport, usin->sin_port, sk, 1);
177         if (tmp < 0) {
178                 if (tmp == -ENETUNREACH)
179                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
180                 return tmp;
181         }
182
183         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184                 ip_rt_put(rt);
185                 return -ENETUNREACH;
186         }
187
188         if (!inet->opt || !inet->opt->srr)
189                 daddr = rt->rt_dst;
190
191         if (!inet->saddr)
192                 inet->saddr = rt->rt_src;
193         inet->rcv_saddr = inet->saddr;
194
195         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
196                 /* Reset inherited state */
197                 tp->rx_opt.ts_recent       = 0;
198                 tp->rx_opt.ts_recent_stamp = 0;
199                 tp->write_seq              = 0;
200         }
201
202         if (tcp_death_row.sysctl_tw_recycle &&
203             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
204                 struct inet_peer *peer = rt_get_peer(rt);
205                 /*
206                  * VJ's idea. We save last timestamp seen from
207                  * the destination in peer table, when entering state
208                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
209                  * when trying new connection.
210                  */
211                 if (peer != NULL &&
212                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
213                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
214                         tp->rx_opt.ts_recent = peer->tcp_ts;
215                 }
216         }
217
218         inet->dport = usin->sin_port;
219         inet->daddr = daddr;
220
221         inet_csk(sk)->icsk_ext_hdr_len = 0;
222         if (inet->opt)
223                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
224
225         tp->rx_opt.mss_clamp = 536;
226
227         /* Socket identity is still unknown (sport may be zero).
228          * However we set state to SYN-SENT and not releasing socket
229          * lock select source port, enter ourselves into the hash tables and
230          * complete initialization after this.
231          */
232         tcp_set_state(sk, TCP_SYN_SENT);
233         err = inet_hash_connect(&tcp_death_row, sk);
234         if (err)
235                 goto failure;
236
237         err = ip_route_newports(&rt, IPPROTO_TCP,
238                                 inet->sport, inet->dport, sk);
239         if (err)
240                 goto failure;
241
242         /* OK, now commit destination to socket.  */
243         sk->sk_gso_type = SKB_GSO_TCPV4;
244         sk_setup_caps(sk, &rt->u.dst);
245
246         if (!tp->write_seq)
247                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
248                                                            inet->daddr,
249                                                            inet->sport,
250                                                            usin->sin_port);
251
252         inet->id = tp->write_seq ^ jiffies;
253
254         err = tcp_connect(sk);
255         rt = NULL;
256         if (err)
257                 goto failure;
258
259         return 0;
260
261 failure:
262         /*
263          * This unhashes the socket and releases the local port,
264          * if necessary.
265          */
266         tcp_set_state(sk, TCP_CLOSE);
267         ip_rt_put(rt);
268         sk->sk_route_caps = 0;
269         inet->dport = 0;
270         return err;
271 }
272
273 /*
274  * This routine does path mtu discovery as defined in RFC1191.
275  */
276 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
277 {
278         struct dst_entry *dst;
279         struct inet_sock *inet = inet_sk(sk);
280
281         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
282          * send out by Linux are always <576bytes so they should go through
283          * unfragmented).
284          */
285         if (sk->sk_state == TCP_LISTEN)
286                 return;
287
288         /* We don't check in the destentry if pmtu discovery is forbidden
289          * on this route. We just assume that no packet_to_big packets
290          * are send back when pmtu discovery is not active.
291          * There is a small race when the user changes this flag in the
292          * route, but I think that's acceptable.
293          */
294         if ((dst = __sk_dst_check(sk, 0)) == NULL)
295                 return;
296
297         dst->ops->update_pmtu(dst, mtu);
298
299         /* Something is about to be wrong... Remember soft error
300          * for the case, if this connection will not able to recover.
301          */
302         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303                 sk->sk_err_soft = EMSGSIZE;
304
305         mtu = dst_mtu(dst);
306
307         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
308             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
309                 tcp_sync_mss(sk, mtu);
310
311                 /* Resend the TCP packet because it's
312                  * clear that the old packet has been
313                  * dropped. This is the new "fast" path mtu
314                  * discovery.
315                  */
316                 tcp_simple_retransmit(sk);
317         } /* else let the usual retransmit timer handle it */
318 }
319
320 /*
321  * This routine is called by the ICMP module when it gets some
322  * sort of error condition.  If err < 0 then the socket should
323  * be closed and the error returned to the user.  If err > 0
324  * it's just the icmp type << 8 | icmp code.  After adjustment
325  * header points to the first 8 bytes of the tcp header.  We need
326  * to find the appropriate port.
327  *
328  * The locking strategy used here is very "optimistic". When
329  * someone else accesses the socket the ICMP is just dropped
330  * and for some paths there is no check at all.
331  * A more general error queue to queue errors for later handling
332  * is probably better.
333  *
334  */
335
336 void tcp_v4_err(struct sk_buff *skb, u32 info)
337 {
338         struct iphdr *iph = (struct iphdr *)skb->data;
339         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
340         struct tcp_sock *tp;
341         struct inet_sock *inet;
342         const int type = icmp_hdr(skb)->type;
343         const int code = icmp_hdr(skb)->code;
344         struct sock *sk;
345         __u32 seq;
346         int err;
347
348         if (skb->len < (iph->ihl << 2) + 8) {
349                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
350                 return;
351         }
352
353         sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
354                         iph->saddr, th->source, inet_iif(skb));
355         if (!sk) {
356                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
357                 return;
358         }
359         if (sk->sk_state == TCP_TIME_WAIT) {
360                 inet_twsk_put(inet_twsk(sk));
361                 return;
362         }
363
364         bh_lock_sock(sk);
365         /* If too many ICMPs get dropped on busy
366          * servers this needs to be solved differently.
367          */
368         if (sock_owned_by_user(sk))
369                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
370
371         if (sk->sk_state == TCP_CLOSE)
372                 goto out;
373
374         tp = tcp_sk(sk);
375         seq = ntohl(th->seq);
376         if (sk->sk_state != TCP_LISTEN &&
377             !between(seq, tp->snd_una, tp->snd_nxt)) {
378                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
379                 goto out;
380         }
381
382         switch (type) {
383         case ICMP_SOURCE_QUENCH:
384                 /* Just silently ignore these. */
385                 goto out;
386         case ICMP_PARAMETERPROB:
387                 err = EPROTO;
388                 break;
389         case ICMP_DEST_UNREACH:
390                 if (code > NR_ICMP_UNREACH)
391                         goto out;
392
393                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
394                         if (!sock_owned_by_user(sk))
395                                 do_pmtu_discovery(sk, iph, info);
396                         goto out;
397                 }
398
399                 err = icmp_err_convert[code].errno;
400                 break;
401         case ICMP_TIME_EXCEEDED:
402                 err = EHOSTUNREACH;
403                 break;
404         default:
405                 goto out;
406         }
407
408         switch (sk->sk_state) {
409                 struct request_sock *req, **prev;
410         case TCP_LISTEN:
411                 if (sock_owned_by_user(sk))
412                         goto out;
413
414                 req = inet_csk_search_req(sk, &prev, th->dest,
415                                           iph->daddr, iph->saddr);
416                 if (!req)
417                         goto out;
418
419                 /* ICMPs are not backlogged, hence we cannot get
420                    an established socket here.
421                  */
422                 BUG_TRAP(!req->sk);
423
424                 if (seq != tcp_rsk(req)->snt_isn) {
425                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
426                         goto out;
427                 }
428
429                 /*
430                  * Still in SYN_RECV, just remove it silently.
431                  * There is no good way to pass the error to the newly
432                  * created socket, and POSIX does not want network
433                  * errors returned from accept().
434                  */
435                 inet_csk_reqsk_queue_drop(sk, req, prev);
436                 goto out;
437
438         case TCP_SYN_SENT:
439         case TCP_SYN_RECV:  /* Cannot happen.
440                                It can f.e. if SYNs crossed.
441                              */
442                 if (!sock_owned_by_user(sk)) {
443                         sk->sk_err = err;
444
445                         sk->sk_error_report(sk);
446
447                         tcp_done(sk);
448                 } else {
449                         sk->sk_err_soft = err;
450                 }
451                 goto out;
452         }
453
454         /* If we've already connected we will keep trying
455          * until we time out, or the user gives up.
456          *
457          * rfc1122 4.2.3.9 allows to consider as hard errors
458          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
459          * but it is obsoleted by pmtu discovery).
460          *
461          * Note, that in modern internet, where routing is unreliable
462          * and in each dark corner broken firewalls sit, sending random
463          * errors ordered by their masters even this two messages finally lose
464          * their original sense (even Linux sends invalid PORT_UNREACHs)
465          *
466          * Now we are in compliance with RFCs.
467          *                                                      --ANK (980905)
468          */
469
470         inet = inet_sk(sk);
471         if (!sock_owned_by_user(sk) && inet->recverr) {
472                 sk->sk_err = err;
473                 sk->sk_error_report(sk);
474         } else  { /* Only an error on timeout */
475                 sk->sk_err_soft = err;
476         }
477
478 out:
479         bh_unlock_sock(sk);
480         sock_put(sk);
481 }
482
483 /* This routine computes an IPv4 TCP checksum. */
484 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
485 {
486         struct inet_sock *inet = inet_sk(sk);
487         struct tcphdr *th = tcp_hdr(skb);
488
489         if (skb->ip_summed == CHECKSUM_PARTIAL) {
490                 th->check = ~tcp_v4_check(len, inet->saddr,
491                                           inet->daddr, 0);
492                 skb->csum_start = skb_transport_header(skb) - skb->head;
493                 skb->csum_offset = offsetof(struct tcphdr, check);
494         } else {
495                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
496                                          csum_partial((char *)th,
497                                                       th->doff << 2,
498                                                       skb->csum));
499         }
500 }
501
502 int tcp_v4_gso_send_check(struct sk_buff *skb)
503 {
504         const struct iphdr *iph;
505         struct tcphdr *th;
506
507         if (!pskb_may_pull(skb, sizeof(*th)))
508                 return -EINVAL;
509
510         iph = ip_hdr(skb);
511         th = tcp_hdr(skb);
512
513         th->check = 0;
514         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
515         skb->csum_start = skb_transport_header(skb) - skb->head;
516         skb->csum_offset = offsetof(struct tcphdr, check);
517         skb->ip_summed = CHECKSUM_PARTIAL;
518         return 0;
519 }
520
521 /*
522  *      This routine will send an RST to the other tcp.
523  *
524  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
525  *                    for reset.
526  *      Answer: if a packet caused RST, it is not for a socket
527  *              existing in our system, if it is matched to a socket,
528  *              it is just duplicate segment or bug in other side's TCP.
529  *              So that we build reply only basing on parameters
530  *              arrived with segment.
531  *      Exception: precedence violation. We do not implement it in any case.
532  */
533
534 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
535 {
536         struct tcphdr *th = tcp_hdr(skb);
537         struct {
538                 struct tcphdr th;
539 #ifdef CONFIG_TCP_MD5SIG
540                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
541 #endif
542         } rep;
543         struct ip_reply_arg arg;
544 #ifdef CONFIG_TCP_MD5SIG
545         struct tcp_md5sig_key *key;
546 #endif
547
548         /* Never send a reset in response to a reset. */
549         if (th->rst)
550                 return;
551
552         if (skb->rtable->rt_type != RTN_LOCAL)
553                 return;
554
555         /* Swap the send and the receive. */
556         memset(&rep, 0, sizeof(rep));
557         rep.th.dest   = th->source;
558         rep.th.source = th->dest;
559         rep.th.doff   = sizeof(struct tcphdr) / 4;
560         rep.th.rst    = 1;
561
562         if (th->ack) {
563                 rep.th.seq = th->ack_seq;
564         } else {
565                 rep.th.ack = 1;
566                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
567                                        skb->len - (th->doff << 2));
568         }
569
570         memset(&arg, 0, sizeof(arg));
571         arg.iov[0].iov_base = (unsigned char *)&rep;
572         arg.iov[0].iov_len  = sizeof(rep.th);
573
574 #ifdef CONFIG_TCP_MD5SIG
575         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
576         if (key) {
577                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
578                                    (TCPOPT_NOP << 16) |
579                                    (TCPOPT_MD5SIG << 8) |
580                                    TCPOLEN_MD5SIG);
581                 /* Update length and the length the header thinks exists */
582                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
583                 rep.th.doff = arg.iov[0].iov_len / 4;
584
585                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
586                                         key,
587                                         ip_hdr(skb)->daddr,
588                                         ip_hdr(skb)->saddr,
589                                         &rep.th, IPPROTO_TCP,
590                                         arg.iov[0].iov_len);
591         }
592 #endif
593         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
594                                       ip_hdr(skb)->saddr, /* XXX */
595                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
596         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
597
598         ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
599                       &arg, arg.iov[0].iov_len);
600
601         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
602         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
603 }
604
605 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
606    outside socket context is ugly, certainly. What can I do?
607  */
608
609 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
610                             struct sk_buff *skb, u32 seq, u32 ack,
611                             u32 win, u32 ts)
612 {
613         struct tcphdr *th = tcp_hdr(skb);
614         struct {
615                 struct tcphdr th;
616                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
617 #ifdef CONFIG_TCP_MD5SIG
618                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
619 #endif
620                         ];
621         } rep;
622         struct ip_reply_arg arg;
623 #ifdef CONFIG_TCP_MD5SIG
624         struct tcp_md5sig_key *key;
625         struct tcp_md5sig_key tw_key;
626 #endif
627
628         memset(&rep.th, 0, sizeof(struct tcphdr));
629         memset(&arg, 0, sizeof(arg));
630
631         arg.iov[0].iov_base = (unsigned char *)&rep;
632         arg.iov[0].iov_len  = sizeof(rep.th);
633         if (ts) {
634                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
635                                    (TCPOPT_TIMESTAMP << 8) |
636                                    TCPOLEN_TIMESTAMP);
637                 rep.opt[1] = htonl(tcp_time_stamp);
638                 rep.opt[2] = htonl(ts);
639                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
640         }
641
642         /* Swap the send and the receive. */
643         rep.th.dest    = th->source;
644         rep.th.source  = th->dest;
645         rep.th.doff    = arg.iov[0].iov_len / 4;
646         rep.th.seq     = htonl(seq);
647         rep.th.ack_seq = htonl(ack);
648         rep.th.ack     = 1;
649         rep.th.window  = htons(win);
650
651 #ifdef CONFIG_TCP_MD5SIG
652         /*
653          * The SKB holds an imcoming packet, but may not have a valid ->sk
654          * pointer. This is especially the case when we're dealing with a
655          * TIME_WAIT ack, because the sk structure is long gone, and only
656          * the tcp_timewait_sock remains. So the md5 key is stashed in that
657          * structure, and we use it in preference.  I believe that (twsk ||
658          * skb->sk) holds true, but we program defensively.
659          */
660         if (!twsk && skb->sk) {
661                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
662         } else if (twsk && twsk->tw_md5_keylen) {
663                 tw_key.key = twsk->tw_md5_key;
664                 tw_key.keylen = twsk->tw_md5_keylen;
665                 key = &tw_key;
666         } else
667                 key = NULL;
668
669         if (key) {
670                 int offset = (ts) ? 3 : 0;
671
672                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
673                                           (TCPOPT_NOP << 16) |
674                                           (TCPOPT_MD5SIG << 8) |
675                                           TCPOLEN_MD5SIG);
676                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
677                 rep.th.doff = arg.iov[0].iov_len/4;
678
679                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
680                                         key,
681                                         ip_hdr(skb)->daddr,
682                                         ip_hdr(skb)->saddr,
683                                         &rep.th, IPPROTO_TCP,
684                                         arg.iov[0].iov_len);
685         }
686 #endif
687         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
688                                       ip_hdr(skb)->saddr, /* XXX */
689                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
690         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
691         if (twsk)
692                 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
693
694         ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
695                       &arg, arg.iov[0].iov_len);
696
697         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
698 }
699
700 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
701 {
702         struct inet_timewait_sock *tw = inet_twsk(sk);
703         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
704
705         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
706                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
707                         tcptw->tw_ts_recent);
708
709         inet_twsk_put(tw);
710 }
711
712 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
713                                   struct request_sock *req)
714 {
715         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
716                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
717                         req->ts_recent);
718 }
719
720 /*
721  *      Send a SYN-ACK after having received a SYN.
722  *      This still operates on a request_sock only, not on a big
723  *      socket.
724  */
725 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
726                                 struct dst_entry *dst)
727 {
728         const struct inet_request_sock *ireq = inet_rsk(req);
729         int err = -1;
730         struct sk_buff * skb;
731
732         /* First, grab a route. */
733         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
734                 return -1;
735
736         skb = tcp_make_synack(sk, dst, req);
737
738         if (skb) {
739                 struct tcphdr *th = tcp_hdr(skb);
740
741                 th->check = tcp_v4_check(skb->len,
742                                          ireq->loc_addr,
743                                          ireq->rmt_addr,
744                                          csum_partial((char *)th, skb->len,
745                                                       skb->csum));
746
747                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
748                                             ireq->rmt_addr,
749                                             ireq->opt);
750                 err = net_xmit_eval(err);
751         }
752
753         dst_release(dst);
754         return err;
755 }
756
/*
 * Send a SYN-ACK for @req without a pre-resolved route: passing a NULL
 * dst makes __tcp_v4_send_synack() look the route up itself.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
	return __tcp_v4_send_synack(sk, req, NULL);
}
761
762 /*
763  *      IPv4 request_sock destructor.
764  */
765 static void tcp_v4_reqsk_destructor(struct request_sock *req)
766 {
767         kfree(inet_rsk(req)->opt);
768 }
769
#ifdef CONFIG_SYN_COOKIES
/*
 * Log a "possible SYN flooding" notice for the destination port of @skb.
 * A new message is printed only once jiffies has moved more than
 * HZ * 60 past the previous one.
 */
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (!time_after(jiffies, (warntime + HZ * 60)))
		return;

	warntime = jiffies;
	printk(KERN_INFO
	       "possible SYN flooding on port %d. Sending cookies.\n",
	       ntohs(tcp_hdr(skb)->dest));
}
#endif
783
784 /*
785  * Save and compile IPv4 options into the request_sock if needed.
786  */
787 static struct ip_options *tcp_v4_save_options(struct sock *sk,
788                                               struct sk_buff *skb)
789 {
790         struct ip_options *opt = &(IPCB(skb)->opt);
791         struct ip_options *dopt = NULL;
792
793         if (opt && opt->optlen) {
794                 int opt_size = optlength(opt);
795                 dopt = kmalloc(opt_size, GFP_ATOMIC);
796                 if (dopt) {
797                         if (ip_options_echo(dopt, skb)) {
798                                 kfree(dopt);
799                                 dopt = NULL;
800                         }
801                 }
802         }
803         return dopt;
804 }
805
806 #ifdef CONFIG_TCP_MD5SIG
807 /*
808  * RFC2385 MD5 checksumming requires a mapping of
809  * IP address->MD5 Key.
810  * We need to maintain these in the sk structure.
811  */
812
813 /* Find the Key structure for an address.  */
814 static struct tcp_md5sig_key *
815                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
816 {
817         struct tcp_sock *tp = tcp_sk(sk);
818         int i;
819
820         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
821                 return NULL;
822         for (i = 0; i < tp->md5sig_info->entries4; i++) {
823                 if (tp->md5sig_info->keys4[i].addr == addr)
824                         return &tp->md5sig_info->keys4[i].base;
825         }
826         return NULL;
827 }
828
829 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
830                                          struct sock *addr_sk)
831 {
832         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
833 }
834
835 EXPORT_SYMBOL(tcp_v4_md5_lookup);
836
837 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
838                                                       struct request_sock *req)
839 {
840         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
841 }
842
843 /* This can be called on a newly created socket, from other files */
844 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
845                       u8 *newkey, u8 newkeylen)
846 {
847         /* Add Key to the list */
848         struct tcp_md5sig_key *key;
849         struct tcp_sock *tp = tcp_sk(sk);
850         struct tcp4_md5sig_key *keys;
851
852         key = tcp_v4_md5_do_lookup(sk, addr);
853         if (key) {
854                 /* Pre-existing entry - just update that one. */
855                 kfree(key->key);
856                 key->key = newkey;
857                 key->keylen = newkeylen;
858         } else {
859                 struct tcp_md5sig_info *md5sig;
860
861                 if (!tp->md5sig_info) {
862                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
863                                                   GFP_ATOMIC);
864                         if (!tp->md5sig_info) {
865                                 kfree(newkey);
866                                 return -ENOMEM;
867                         }
868                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
869                 }
870                 if (tcp_alloc_md5sig_pool() == NULL) {
871                         kfree(newkey);
872                         return -ENOMEM;
873                 }
874                 md5sig = tp->md5sig_info;
875
876                 if (md5sig->alloced4 == md5sig->entries4) {
877                         keys = kmalloc((sizeof(*keys) *
878                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
879                         if (!keys) {
880                                 kfree(newkey);
881                                 tcp_free_md5sig_pool();
882                                 return -ENOMEM;
883                         }
884
885                         if (md5sig->entries4)
886                                 memcpy(keys, md5sig->keys4,
887                                        sizeof(*keys) * md5sig->entries4);
888
889                         /* Free old key list, and reference new one */
890                         kfree(md5sig->keys4);
891                         md5sig->keys4 = keys;
892                         md5sig->alloced4++;
893                 }
894                 md5sig->entries4++;
895                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
896                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
897                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
898         }
899         return 0;
900 }
901
902 EXPORT_SYMBOL(tcp_v4_md5_do_add);
903
904 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
905                                u8 *newkey, u8 newkeylen)
906 {
907         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
908                                  newkey, newkeylen);
909 }
910
911 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
912 {
913         struct tcp_sock *tp = tcp_sk(sk);
914         int i;
915
916         for (i = 0; i < tp->md5sig_info->entries4; i++) {
917                 if (tp->md5sig_info->keys4[i].addr == addr) {
918                         /* Free the key */
919                         kfree(tp->md5sig_info->keys4[i].base.key);
920                         tp->md5sig_info->entries4--;
921
922                         if (tp->md5sig_info->entries4 == 0) {
923                                 kfree(tp->md5sig_info->keys4);
924                                 tp->md5sig_info->keys4 = NULL;
925                                 tp->md5sig_info->alloced4 = 0;
926                         } else if (tp->md5sig_info->entries4 != i) {
927                                 /* Need to do some manipulation */
928                                 memmove(&tp->md5sig_info->keys4[i],
929                                         &tp->md5sig_info->keys4[i+1],
930                                         (tp->md5sig_info->entries4 - i) *
931                                          sizeof(struct tcp4_md5sig_key));
932                         }
933                         tcp_free_md5sig_pool();
934                         return 0;
935                 }
936         }
937         return -ENOENT;
938 }
939
940 EXPORT_SYMBOL(tcp_v4_md5_do_del);
941
942 static void tcp_v4_clear_md5_list(struct sock *sk)
943 {
944         struct tcp_sock *tp = tcp_sk(sk);
945
946         /* Free each key, then the set of key keys,
947          * the crypto element, and then decrement our
948          * hold on the last resort crypto.
949          */
950         if (tp->md5sig_info->entries4) {
951                 int i;
952                 for (i = 0; i < tp->md5sig_info->entries4; i++)
953                         kfree(tp->md5sig_info->keys4[i].base.key);
954                 tp->md5sig_info->entries4 = 0;
955                 tcp_free_md5sig_pool();
956         }
957         if (tp->md5sig_info->keys4) {
958                 kfree(tp->md5sig_info->keys4);
959                 tp->md5sig_info->keys4 = NULL;
960                 tp->md5sig_info->alloced4  = 0;
961         }
962 }
963
964 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
965                                  int optlen)
966 {
967         struct tcp_md5sig cmd;
968         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
969         u8 *newkey;
970
971         if (optlen < sizeof(cmd))
972                 return -EINVAL;
973
974         if (copy_from_user(&cmd, optval, sizeof(cmd)))
975                 return -EFAULT;
976
977         if (sin->sin_family != AF_INET)
978                 return -EINVAL;
979
980         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
981                 if (!tcp_sk(sk)->md5sig_info)
982                         return -ENOENT;
983                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
984         }
985
986         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
987                 return -EINVAL;
988
989         if (!tcp_sk(sk)->md5sig_info) {
990                 struct tcp_sock *tp = tcp_sk(sk);
991                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
992
993                 if (!p)
994                         return -EINVAL;
995
996                 tp->md5sig_info = p;
997                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
998         }
999
1000         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1001         if (!newkey)
1002                 return -ENOMEM;
1003         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1004                                  newkey, cmd.tcpm_keylen);
1005 }
1006
1007 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1008                                    __be32 saddr, __be32 daddr,
1009                                    struct tcphdr *th, int protocol,
1010                                    unsigned int tcplen)
1011 {
1012         struct scatterlist sg[4];
1013         __u16 data_len;
1014         int block = 0;
1015         __sum16 old_checksum;
1016         struct tcp_md5sig_pool *hp;
1017         struct tcp4_pseudohdr *bp;
1018         struct hash_desc *desc;
1019         int err;
1020         unsigned int nbytes = 0;
1021
1022         /*
1023          * Okay, so RFC2385 is turned on for this connection,
1024          * so we need to generate the MD5 hash for the packet now.
1025          */
1026
1027         hp = tcp_get_md5sig_pool();
1028         if (!hp)
1029                 goto clear_hash_noput;
1030
1031         bp = &hp->md5_blk.ip4;
1032         desc = &hp->md5_desc;
1033
1034         /*
1035          * 1. the TCP pseudo-header (in the order: source IP address,
1036          * destination IP address, zero-padded protocol number, and
1037          * segment length)
1038          */
1039         bp->saddr = saddr;
1040         bp->daddr = daddr;
1041         bp->pad = 0;
1042         bp->protocol = protocol;
1043         bp->len = htons(tcplen);
1044
1045         sg_init_table(sg, 4);
1046
1047         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1048         nbytes += sizeof(*bp);
1049
1050         /* 2. the TCP header, excluding options, and assuming a
1051          * checksum of zero/
1052          */
1053         old_checksum = th->check;
1054         th->check = 0;
1055         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1056         nbytes += sizeof(struct tcphdr);
1057
1058         /* 3. the TCP segment data (if any) */
1059         data_len = tcplen - (th->doff << 2);
1060         if (data_len > 0) {
1061                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1062                 sg_set_buf(&sg[block++], data, data_len);
1063                 nbytes += data_len;
1064         }
1065
1066         /* 4. an independently-specified key or password, known to both
1067          * TCPs and presumably connection-specific
1068          */
1069         sg_set_buf(&sg[block++], key->key, key->keylen);
1070         nbytes += key->keylen;
1071
1072         sg_mark_end(&sg[block - 1]);
1073
1074         /* Now store the Hash into the packet */
1075         err = crypto_hash_init(desc);
1076         if (err)
1077                 goto clear_hash;
1078         err = crypto_hash_update(desc, sg, nbytes);
1079         if (err)
1080                 goto clear_hash;
1081         err = crypto_hash_final(desc, md5_hash);
1082         if (err)
1083                 goto clear_hash;
1084
1085         /* Reset header, and free up the crypto */
1086         tcp_put_md5sig_pool();
1087         th->check = old_checksum;
1088
1089 out:
1090         return 0;
1091 clear_hash:
1092         tcp_put_md5sig_pool();
1093 clear_hash_noput:
1094         memset(md5_hash, 0, 16);
1095         goto out;
1096 }
1097
1098 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1099                          struct sock *sk,
1100                          struct dst_entry *dst,
1101                          struct request_sock *req,
1102                          struct tcphdr *th, int protocol,
1103                          unsigned int tcplen)
1104 {
1105         __be32 saddr, daddr;
1106
1107         if (sk) {
1108                 saddr = inet_sk(sk)->saddr;
1109                 daddr = inet_sk(sk)->daddr;
1110         } else {
1111                 struct rtable *rt = (struct rtable *)dst;
1112                 BUG_ON(!rt);
1113                 saddr = rt->rt_src;
1114                 daddr = rt->rt_dst;
1115         }
1116         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1117                                        saddr, daddr,
1118                                        th, protocol, tcplen);
1119 }
1120
1121 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1122
1123 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1124 {
1125         /*
1126          * This gets called for each TCP segment that arrives
1127          * so we want to be efficient.
1128          * We have 3 drop cases:
1129          * o No MD5 hash and one expected.
1130          * o MD5 hash and we're not expecting one.
1131          * o MD5 hash and its wrong.
1132          */
1133         __u8 *hash_location = NULL;
1134         struct tcp_md5sig_key *hash_expected;
1135         const struct iphdr *iph = ip_hdr(skb);
1136         struct tcphdr *th = tcp_hdr(skb);
1137         int length = (th->doff << 2) - sizeof(struct tcphdr);
1138         int genhash;
1139         unsigned char *ptr;
1140         unsigned char newhash[16];
1141
1142         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1143
1144         /*
1145          * If the TCP option length is less than the TCP_MD5SIG
1146          * option length, then we can shortcut
1147          */
1148         if (length < TCPOLEN_MD5SIG) {
1149                 if (hash_expected)
1150                         return 1;
1151                 else
1152                         return 0;
1153         }
1154
1155         /* Okay, we can't shortcut - we have to grub through the options */
1156         ptr = (unsigned char *)(th + 1);
1157         while (length > 0) {
1158                 int opcode = *ptr++;
1159                 int opsize;
1160
1161                 switch (opcode) {
1162                 case TCPOPT_EOL:
1163                         goto done_opts;
1164                 case TCPOPT_NOP:
1165                         length--;
1166                         continue;
1167                 default:
1168                         opsize = *ptr++;
1169                         if (opsize < 2)
1170                                 goto done_opts;
1171                         if (opsize > length)
1172                                 goto done_opts;
1173
1174                         if (opcode == TCPOPT_MD5SIG) {
1175                                 hash_location = ptr;
1176                                 goto done_opts;
1177                         }
1178                 }
1179                 ptr += opsize-2;
1180                 length -= opsize;
1181         }
1182 done_opts:
1183         /* We've parsed the options - do we have a hash? */
1184         if (!hash_expected && !hash_location)
1185                 return 0;
1186
1187         if (hash_expected && !hash_location) {
1188                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1189                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1190                                NIPQUAD(iph->saddr), ntohs(th->source),
1191                                NIPQUAD(iph->daddr), ntohs(th->dest));
1192                 return 1;
1193         }
1194
1195         if (!hash_expected && hash_location) {
1196                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1197                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1198                                NIPQUAD(iph->saddr), ntohs(th->source),
1199                                NIPQUAD(iph->daddr), ntohs(th->dest));
1200                 return 1;
1201         }
1202
1203         /* Okay, so this is hash_expected and hash_location -
1204          * so we need to calculate the checksum.
1205          */
1206         genhash = tcp_v4_do_calc_md5_hash(newhash,
1207                                           hash_expected,
1208                                           iph->saddr, iph->daddr,
1209                                           th, sk->sk_protocol,
1210                                           skb->len);
1211
1212         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1213                 if (net_ratelimit()) {
1214                         printk(KERN_INFO "MD5 Hash failed for "
1215                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1216                                NIPQUAD(iph->saddr), ntohs(th->source),
1217                                NIPQUAD(iph->daddr), ntohs(th->dest),
1218                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1219                 }
1220                 return 1;
1221         }
1222         return 0;
1223 }
1224
1225 #endif
1226
/*
 * Request-sock operations for IPv4 TCP (family PF_INET).  Objects are
 * sized as struct tcp_request_sock; the handlers are the SYN-ACK
 * retransmit, ACK, reset and destructor routines named below.
 */
1227 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1228         .family         =       PF_INET,
1229         .obj_size       =       sizeof(struct tcp_request_sock),
1230         .rtx_syn_ack    =       tcp_v4_send_synack,
1231         .send_ack       =       tcp_v4_reqsk_send_ack,
1232         .destructor     =       tcp_v4_reqsk_destructor,
1233         .send_reset     =       tcp_v4_send_reset,
1234 };
1235
1236 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 hook table stored in tcp_rsk(req)->af_specific by
 * tcp_v4_conn_request(); it supplies the MD5 key lookup for a request.
 */
1237 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1238         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1239 };
1240 #endif
1241
/*
 * Timewait-sock operations for TCP: objects are sized as
 * struct tcp_timewait_sock, with the uniqueness check and destructor
 * provided by tcp_twsk_unique() and tcp_twsk_destructor().
 */
1242 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1243         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1244         .twsk_unique    = tcp_twsk_unique,
1245         .twsk_destructor= tcp_twsk_destructor,
1246 };
1247
/*
 * Process a connection request carried by @skb for socket @sk.
 *
 * Steps, in order: refuse broadcast/multicast destinations; apply the
 * request-queue and accept-queue limits (falling back to SYN cookies when
 * CONFIG_SYN_COOKIES is built in and sysctl_tcp_syncookies is set);
 * allocate a request_sock and fill it from the parsed TCP options and the
 * packet's addresses and IP options; choose the initial sequence number;
 * send the SYN-ACK; and, unless cookies are in use, add the request to
 * the request hash with a TCP_TIMEOUT_INIT timeout.
 *
 * Every path returns 0; a dropped request is simply not queued.
 */
1248 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1249 {
1250         struct inet_request_sock *ireq;
1251         struct tcp_options_received tmp_opt;
1252         struct request_sock *req;
1253         __be32 saddr = ip_hdr(skb)->saddr;
1254         __be32 daddr = ip_hdr(skb)->daddr;
1255         __u32 isn = TCP_SKB_CB(skb)->when;
1256         struct dst_entry *dst = NULL;
1257 #ifdef CONFIG_SYN_COOKIES
1258         int want_cookie = 0;
1259 #else
1260 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1261 #endif
1262
1263         /* Never answer to SYNs send to broadcast or multicast */
1264         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1265                 goto drop;
1266
1267         /* TW buckets are converted to open requests without
1268          * limitations, they conserve resources and peer is
1269          * evidently real one.
1270          */
1271         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1272 #ifdef CONFIG_SYN_COOKIES
1273                 if (sysctl_tcp_syncookies) {
1274                         want_cookie = 1;
1275                 } else
1276 #endif
1277                 goto drop;
1278         }
1279
1280         /* Accept backlog is full. If we have already queued enough
1281          * of warm entries in syn queue, drop request. It is better than
1282          * clogging syn queue with openreqs with exponentially increasing
1283          * timeout.
1284          */
1285         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1286                 goto drop;
1287
1288         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1289         if (!req)
1290                 goto drop;
1291
1292 #ifdef CONFIG_TCP_MD5SIG
1293         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1294 #endif
1295
             /* Parse the TCP options starting from a 536-byte MSS clamp and
              * the user_mss value copied from @sk.
              */
1296         tcp_clear_options(&tmp_opt);
1297         tmp_opt.mss_clamp = 536;
1298         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1299
1300         tcp_parse_options(skb, &tmp_opt, 0);
1301
             /* In cookie mode the parsed options are discarded unless a
              * timestamp option was seen.
              */
1302         if (want_cookie && !tmp_opt.saw_tstamp)
1303                 tcp_clear_options(&tmp_opt);
1304
1305         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1306                 /* Some OSes (unknown ones, but I see them on web server, which
1307                  * contains information interesting only for windows'
1308                  * users) do not send their stamp in SYN. It is easy case.
1309                  * We simply do not advertise TS support.
1310                  */
1311                 tmp_opt.saw_tstamp = 0;
1312                 tmp_opt.tstamp_ok  = 0;
1313         }
1314         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1315
1316         tcp_openreq_init(req, &tmp_opt, skb);
1317
1318         if (security_inet_conn_request(sk, skb, req))
1319                 goto drop_and_free;
1320
1321         ireq = inet_rsk(req);
1322         ireq->loc_addr = daddr;
1323         ireq->rmt_addr = saddr;
1324         ireq->opt = tcp_v4_save_options(sk, skb);
1325         if (!want_cookie)
1326                 TCP_ECN_create_request(req, tcp_hdr(skb));
1327
1328         if (want_cookie) {
1329 #ifdef CONFIG_SYN_COOKIES
1330                 syn_flood_warning(skb);
1331                 req->cookie_ts = tmp_opt.tstamp_ok;
1332 #endif
1333                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1334         } else if (!isn) {
1335                 struct inet_peer *peer = NULL;
1336
1337                 /* VJ's idea. We save last timestamp seen
1338                  * from the destination in peer table, when entering
1339                  * state TIME-WAIT, and check against it before
1340                  * accepting new connection request.
1341                  *
1342                  * If "isn" is not zero, this request hit alive
1343                  * timewait bucket, so that all the necessary checks
1344                  * are made in the function processing timewait state.
1345                  */
1346                 if (tmp_opt.saw_tstamp &&
1347                     tcp_death_row.sysctl_tw_recycle &&
1348                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1349                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1350                     peer->v4daddr == saddr) {
1351                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1352                             (s32)(peer->tcp_ts - req->ts_recent) >
1353                                                         TCP_PAWS_WINDOW) {
1354                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1355                                 goto drop_and_release;
1356                         }
1357                 }
1358                 /* Kill the following clause, if you dislike this way. */
1359                 else if (!sysctl_tcp_syncookies &&
1360                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1361                           (sysctl_max_syn_backlog >> 2)) &&
1362                          (!peer || !peer->tcp_ts_stamp) &&
1363                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1364                         /* Without syncookies last quarter of
1365                          * backlog is filled with destinations,
1366                          * proven to be alive.
1367                          * It means that we continue to communicate
1368                          * to destinations, already remembered
1369                          * to the moment of synflood.
1370                          */
1371                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1372                                        "request from " NIPQUAD_FMT "/%u\n",
1373                                        NIPQUAD(saddr),
1374                                        ntohs(tcp_hdr(skb)->source));
1375                         goto drop_and_release;
1376                 }
1377
1378                 isn = tcp_v4_init_sequence(skb);
1379         }
1380         tcp_rsk(req)->snt_isn = isn;
1381
             /* The request is freed if the SYN-ACK could not be sent, and
              * also whenever cookies are in use (it is never queued then).
              */
1382         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1383                 goto drop_and_free;
1384
1385         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1386         return 0;
1387
             /* Unwind labels fall through: release the route, then free the
              * request, then return.
              */
1388 drop_and_release:
1389         dst_release(dst);
1390 drop_and_free:
1391         reqsk_free(req);
1392 drop:
1393         return 0;
1394 }
1395
1396
1397 /*
1398  * The three way handshake has completed - we got a valid synack -
1399  * now create the new socket.
1400  */
1401 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1402                                   struct request_sock *req,
1403                                   struct dst_entry *dst)
1404 {
1405         struct inet_request_sock *ireq;
1406         struct inet_sock *newinet;
1407         struct tcp_sock *newtp;
1408         struct sock *newsk;
1409 #ifdef CONFIG_TCP_MD5SIG
1410         struct tcp_md5sig_key *key;
1411 #endif
1412
1413         if (sk_acceptq_is_full(sk))
1414                 goto exit_overflow;
1415
1416         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1417                 goto exit;
1418
1419         newsk = tcp_create_openreq_child(sk, req, skb);
1420         if (!newsk)
1421                 goto exit;
1422
1423         newsk->sk_gso_type = SKB_GSO_TCPV4;
1424         sk_setup_caps(newsk, dst);
1425
1426         newtp                 = tcp_sk(newsk);
1427         newinet               = inet_sk(newsk);
1428         ireq                  = inet_rsk(req);
1429         newinet->daddr        = ireq->rmt_addr;
1430         newinet->rcv_saddr    = ireq->loc_addr;
1431         newinet->saddr        = ireq->loc_addr;
1432         newinet->opt          = ireq->opt;
1433         ireq->opt             = NULL;
1434         newinet->mc_index     = inet_iif(skb);
1435         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1436         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1437         if (newinet->opt)
1438                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1439         newinet->id = newtp->write_seq ^ jiffies;
1440
1441         tcp_mtup_init(newsk);
1442         tcp_sync_mss(newsk, dst_mtu(dst));
1443         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1444         tcp_initialize_rcv_mss(newsk);
1445
1446 #ifdef CONFIG_TCP_MD5SIG
1447         /* Copy over the MD5 key from the original socket */
1448         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1449                 /*
1450                  * We're using one, so create a matching key
1451                  * on the newsk structure. If we fail to get
1452                  * memory, then we end up not copying the key
1453                  * across. Shucks.
1454                  */
1455                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1456                 if (newkey != NULL)
1457                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1458                                           newkey, key->keylen);
1459         }
1460 #endif
1461
1462         __inet_hash_nolisten(newsk);
1463         __inet_inherit_port(sk, newsk);
1464
1465         return newsk;
1466
1467 exit_overflow:
1468         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1469 exit:
1470         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1471         dst_release(dst);
1472         return NULL;
1473 }
1474
1475 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1476 {
1477         struct tcphdr *th = tcp_hdr(skb);
1478         const struct iphdr *iph = ip_hdr(skb);
1479         struct sock *nsk;
1480         struct request_sock **prev;
1481         /* Find possible connection requests. */
1482         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1483                                                        iph->saddr, iph->daddr);
1484         if (req)
1485                 return tcp_check_req(sk, skb, req, prev);
1486
1487         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1488                         th->source, iph->daddr, th->dest, inet_iif(skb));
1489
1490         if (nsk) {
1491                 if (nsk->sk_state != TCP_TIME_WAIT) {
1492                         bh_lock_sock(nsk);
1493                         return nsk;
1494                 }
1495                 inet_twsk_put(inet_twsk(nsk));
1496                 return NULL;
1497         }
1498
1499 #ifdef CONFIG_SYN_COOKIES
1500         if (!th->rst && !th->syn && th->ack)
1501                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1502 #endif
1503         return sk;
1504 }
1505
1506 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1507 {
1508         const struct iphdr *iph = ip_hdr(skb);
1509
1510         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1511                 if (!tcp_v4_check(skb->len, iph->saddr,
1512                                   iph->daddr, skb->csum)) {
1513                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1514                         return 0;
1515                 }
1516         }
1517
1518         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1519                                        skb->len, IPPROTO_TCP, 0);
1520
1521         if (skb->len <= 76) {
1522                 return __skb_checksum_complete(skb);
1523         }
1524         return 0;
1525 }
1526
1527
1528 /* The socket must have its spinlock held when we get
1529  * here.
1530  *
1531  * We have a potential double-lock case here, so even when
1532  * doing backlog processing we use the BH locking scheme.
1533  * This is because we cannot sleep with the original spinlock
1534  * held.
1535  */
/*
 * Dispatch one received segment @skb on socket @sk according to the
 * socket state.  Always returns 0.  The skb is freed here on the reset,
 * discard and checksum-error paths.
 */
1536 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1537 {
1538         struct sock *rsk;
1539 #ifdef CONFIG_TCP_MD5SIG
1540         /*
1541          * We really want to reject the packet as early as possible
1542          * if:
1543          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1544          *  o There is an MD5 option and we're not expecting one
1545          */
1546         if (tcp_v4_inbound_md5_hash(sk, skb))
1547                 goto discard;
1548 #endif
1549
1550         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1551                 TCP_CHECK_TIMER(sk);
1552                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1553                         rsk = sk;
1554                         goto reset;
1555                 }
1556                 TCP_CHECK_TIMER(sk);
1557                 return 0;
1558         }
1559
             /* Other states: check header length and checksum first. */
1560         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1561                 goto csum_err;
1562
1563         if (sk->sk_state == TCP_LISTEN) {
1564                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1565                 if (!nsk)
1566                         goto discard;
1567
                     /* A different socket was selected: let it process
                      * the segment; a reset, if needed, is sent for it.
                      */
1568                 if (nsk != sk) {
1569                         if (tcp_child_process(sk, nsk, skb)) {
1570                                 rsk = nsk;
1571                                 goto reset;
1572                         }
1573                         return 0;
1574                 }
1575         }
1576
1577         TCP_CHECK_TIMER(sk);
1578         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1579                 rsk = sk;
1580                 goto reset;
1581         }
1582         TCP_CHECK_TIMER(sk);
1583         return 0;
1584
             /* reset falls through to discard: send the reset for rsk,
              * then free the skb.
              */
1585 reset:
1586         tcp_v4_send_reset(rsk, skb);
1587 discard:
1588         kfree_skb(skb);
1589         /* Be careful here. If this function gets more complicated and
1590          * gcc suffers from register pressure on the x86, sk (in %ebx)
1591          * might be destroyed here. This current version compiles correctly,
1592          * but you have been warned.
1593          */
1594         return 0;
1595
1596 csum_err:
1597         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1598         goto discard;
1599 }
1600
1601 /*
1602  *      From tcp_input.c
1603  */
1604
/*
 * Receive entry point for an incoming IPv4 TCP segment.
 *
 * Validates the TCP header (pullable, sane data offset, checksum set up),
 * fills the per-skb TCP control block, looks up the owning socket and then
 * either processes the segment directly, hands it to the prequeue, or puts
 * it on the socket backlog when the socket is owned by user context.
 * TIME_WAIT sockets and the "no socket found" case are handled at the
 * labels near the end of the function.
 *
 * Returns the result of tcp_v4_do_rcv() when that was called on the normal
 * path, and 0 on every other path (queued, discarded, or reset sent).
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;
        struct sock *sk;
        int ret;

        /* Only packets addressed to this host are processed. */
        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = tcp_hdr(skb);

        /* doff is in 32-bit words; it must at least cover the fixed header. */
        if (th->doff < sizeof(struct tcphdr) / 4)
                goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */
        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
                goto bad_packet;

        /* Header pointers are (re)loaded after the pull calls above. */
        th = tcp_hdr(skb);
        iph = ip_hdr(skb);
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        /* SYN and FIN each consume one unit of sequence space. */
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when    = 0;
        TCP_SKB_CB(skb)->flags   = iph->tos;
        TCP_SKB_CB(skb)->sacked  = 0;

        sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
                        th->source, iph->daddr, th->dest, inet_iif(skb));
        if (!sk)
                goto no_tcp_socket;

        /* Also re-entered from do_time_wait with sk replaced by a listener. */
process:
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;
        nf_reset(skb);

        if (sk_filter(sk, skb))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock_nested(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
                struct tcp_sock *tp = tcp_sk(sk);
                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                        tp->ucopy.dma_chan = get_softnet_dma();
                /* With a DMA channel in use the prequeue is bypassed. */
                if (tp->ucopy.dma_chan)
                        ret = tcp_v4_do_rcv(sk, skb);
                else
#endif
                {
                        /* The assignment below is the body of this if,
                         * despite its indentation.
                         */
                        if (!tcp_prequeue(sk, skb))
                        ret = tcp_v4_do_rcv(sk, skb);
                }
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);

        /* Drop the socket reference held since the lookup. */
        sock_put(sk);

        return ret;

no_tcp_socket:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        /* Malformed segments are only counted; well-formed ones get a reset.
         * bad_packet is also jumped to from the header checks above.
         */
        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(NULL, skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sock_put(sk);
        goto discard_it;

do_time_wait:
        /* Here sk is a timewait socket; every exit must drop its reference
         * with inet_twsk_put() unless tcp_timewait_state_process() or the
         * ACK helper takes care of it.
         * NOTE(review): reference handling inside those helpers is not
         * visible here - confirm against their definitions.
         */
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put(inet_twsk(sk));
                goto discard_it;
        }

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
                inet_twsk_put(inet_twsk(sk));
                goto discard_it;
        }
        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
        case TCP_TW_SYN: {
                /* A new SYN hit a TIME_WAIT socket: if a listener exists,
                 * retire the timewait socket and process the SYN on the
                 * listener instead.
                 */
                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                        &tcp_hashinfo,
                                                        iph->daddr, th->dest,
                                                        inet_iif(skb));
                if (sk2) {
                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
                        inet_twsk_put(inet_twsk(sk));
                        sk = sk2;
                        goto process;
                }
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                goto no_tcp_socket;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}
1740
1741 /* VJ's idea. Save last timestamp seen from this destination
1742  * and hold it at least for normal timewait interval to use for duplicate
1743  * segment detection in subsequent connections, before they enter synchronized
1744  * state.
1745  */
1746
1747 int tcp_v4_remember_stamp(struct sock *sk)
1748 {
1749         struct inet_sock *inet = inet_sk(sk);
1750         struct tcp_sock *tp = tcp_sk(sk);
1751         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1752         struct inet_peer *peer = NULL;
1753         int release_it = 0;
1754
1755         if (!rt || rt->rt_dst != inet->daddr) {
1756                 peer = inet_getpeer(inet->daddr, 1);
1757                 release_it = 1;
1758         } else {
1759                 if (!rt->peer)
1760                         rt_bind_peer(rt, 1);
1761                 peer = rt->peer;
1762         }
1763
1764         if (peer) {
1765                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1766                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1767                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1768                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1769                         peer->tcp_ts = tp->rx_opt.ts_recent;
1770                 }
1771                 if (release_it)
1772                         inet_putpeer(peer);
1773                 return 1;
1774         }
1775
1776         return 0;
1777 }
1778
1779 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1780 {
1781         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1782
1783         if (peer) {
1784                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1785
1786                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1787                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1788                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1789                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1790                         peer->tcp_ts       = tcptw->tw_ts_recent;
1791                 }
1792                 inet_putpeer(peer);
1793                 return 1;
1794         }
1795
1796         return 0;
1797 }
1798
/*
 * IPv4 address-family operations for connection-oriented sockets.
 * tcp_v4_init_sock() installs this table as icsk->icsk_af_ops; it is also
 * exported for use outside this file.
 */
struct inet_connection_sock_af_ops ipv4_specific = {
        .queue_xmit        = ip_queue_xmit,
        .send_check        = tcp_v4_send_check,
        .rebuild_header    = inet_sk_rebuild_header,
        .conn_request      = tcp_v4_conn_request,
        .syn_recv_sock     = tcp_v4_syn_recv_sock,
        .remember_stamp    = tcp_v4_remember_stamp,
        .net_header_len    = sizeof(struct iphdr),
        .setsockopt        = ip_setsockopt,
        .getsockopt        = ip_getsockopt,
        .addr2sockaddr     = inet_csk_addr2sockaddr,
        .sockaddr_len      = sizeof(struct sockaddr_in),
        .bind_conflict     = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
        /* 32-bit compatibility variants of the socket option handlers. */
        .compat_setsockopt = compat_ip_setsockopt,
        .compat_getsockopt = compat_ip_getsockopt,
#endif
};
1817
1818 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 handlers for TCP MD5 signatures; tcp_v4_init_sock() stores this
 * table in tp->af_specific when CONFIG_TCP_MD5SIG is enabled.
 */
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        .md5_lookup             = tcp_v4_md5_lookup,
        .calc_md5_hash          = tcp_v4_calc_md5_hash,
        .md5_add                = tcp_v4_md5_add_func,
        .md5_parse              = tcp_v4_parse_md5_keys,
};
1825 #endif
1826
1827 /* NOTE: A lot of things set to zero explicitly by call to
1828  *       sk_alloc() so need not be done here.
1829  */
1830 static int tcp_v4_init_sock(struct sock *sk)
1831 {
1832         struct inet_connection_sock *icsk = inet_csk(sk);
1833         struct tcp_sock *tp = tcp_sk(sk);
1834
1835         skb_queue_head_init(&tp->out_of_order_queue);
1836         tcp_init_xmit_timers(sk);
1837         tcp_prequeue_init(tp);
1838
1839         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1840         tp->mdev = TCP_TIMEOUT_INIT;
1841
1842         /* So many TCP implementations out there (incorrectly) count the
1843          * initial SYN frame in their delayed-ACK and congestion control
1844          * algorithms that we must have the following bandaid to talk
1845          * efficiently to them.  -DaveM
1846          */
1847         tp->snd_cwnd = 2;
1848
1849         /* See draft-stevens-tcpca-spec-01 for discussion of the
1850          * initialization of these values.
1851          */
1852         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1853         tp->snd_cwnd_clamp = ~0;
1854         tp->mss_cache = 536;
1855
1856         tp->reordering = sysctl_tcp_reordering;
1857         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1858
1859         sk->sk_state = TCP_CLOSE;
1860
1861         sk->sk_write_space = sk_stream_write_space;
1862         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1863
1864         icsk->icsk_af_ops = &ipv4_specific;
1865         icsk->icsk_sync_mss = tcp_sync_mss;
1866 #ifdef CONFIG_TCP_MD5SIG
1867         tp->af_specific = &tcp_sock_ipv4_specific;
1868 #endif
1869
1870         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1871         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1872
1873         atomic_inc(&tcp_sockets_allocated);
1874
1875         return 0;
1876 }
1877
/*
 * Tear down the TCP-specific state of a socket (installed as
 * tcp_prot.destroy): stop timers, release congestion control state, purge
 * all queues, free MD5 keys, drop the bind bucket, free the cached sendmsg
 * page and release any deferred-accept request.
 *
 * Always returns 0.
 */
int tcp_v4_destroy_sock(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tcp_clear_xmit_timers(sk);

        tcp_cleanup_congestion_control(sk);

        /* Clean up the write buffer. */
        tcp_write_queue_purge(sk);

        /* Cleans up our, hopefully empty, out_of_order_queue. */
        __skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
        /* Clean up the MD5 key list, if any */
        if (tp->md5sig_info) {
                tcp_v4_clear_md5_list(sk);
                kfree(tp->md5sig_info);
                tp->md5sig_info = NULL;
        }
#endif

#ifdef CONFIG_NET_DMA
        /* Cleans up our sk_async_wait_queue */
        __skb_queue_purge(&sk->sk_async_wait_queue);
#endif

        /* Clean prequeue, it must be empty really */
        __skb_queue_purge(&tp->ucopy.prequeue);

        /* Clean up a referenced TCP bind bucket. */
        if (inet_csk(sk)->icsk_bind_hash)
                inet_put_port(sk);

        /*
         * If sendmsg cached page exists, toss it.
         */
        if (sk->sk_sndmsg_page) {
                __free_page(sk->sk_sndmsg_page);
                sk->sk_sndmsg_page = NULL;
        }

        /* A pending deferred-accept request holds references on both the
         * listening socket and this socket; free the request, drop both
         * references and clear the pointers.
         */
        if (tp->defer_tcp_accept.request) {
                reqsk_free(tp->defer_tcp_accept.request);
                sock_put(tp->defer_tcp_accept.listen_sk);
                sock_put(sk);
                tp->defer_tcp_accept.listen_sk = NULL;
                tp->defer_tcp_accept.request = NULL;
        }

        /* Balances the atomic_inc() done in tcp_v4_init_sock(). */
        atomic_dec(&tcp_sockets_allocated);

        return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
1935
1936 #ifdef CONFIG_PROC_FS
1937 /* Proc filesystem TCP sock list dumping. */
1938
1939 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1940 {
1941         return hlist_empty(head) ? NULL :
1942                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1943 }
1944
1945 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1946 {
1947         return tw->tw_node.next ?
1948                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1949 }
1950
1951 static void *listening_get_next(struct seq_file *seq, void *cur)
1952 {
1953         struct inet_connection_sock *icsk;
1954         struct hlist_node *node;
1955         struct sock *sk = cur;
1956         struct tcp_iter_state* st = seq->private;
1957         struct net *net = seq_file_net(seq);
1958
1959         if (!sk) {
1960                 st->bucket = 0;
1961                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1962                 goto get_sk;
1963         }
1964
1965         ++st->num;
1966
1967         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1968                 struct request_sock *req = cur;
1969
1970                 icsk = inet_csk(st->syn_wait_sk);
1971                 req = req->dl_next;
1972                 while (1) {
1973                         while (req) {
1974                                 if (req->rsk_ops->family == st->family &&
1975                                     net_eq(sock_net(req->sk), net)) {
1976                                         cur = req;
1977                                         goto out;
1978                                 }
1979                                 req = req->dl_next;
1980                         }
1981                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1982                                 break;
1983 get_req:
1984                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1985                 }
1986                 sk        = sk_next(st->syn_wait_sk);
1987                 st->state = TCP_SEQ_STATE_LISTENING;
1988                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1989         } else {
1990                 icsk = inet_csk(sk);
1991                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1992                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1993                         goto start_req;
1994                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1995                 sk = sk_next(sk);
1996         }
1997 get_sk:
1998         sk_for_each_from(sk, node) {
1999                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2000                         cur = sk;
2001                         goto out;
2002                 }
2003                 icsk = inet_csk(sk);
2004                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2005                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2006 start_req:
2007                         st->uid         = sock_i_uid(sk);
2008                         st->syn_wait_sk = sk;
2009                         st->state       = TCP_SEQ_STATE_OPENREQ;
2010                         st->sbucket     = 0;
2011                         goto get_req;
2012                 }
2013                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2014         }
2015         if (++st->bucket < INET_LHTABLE_SIZE) {
2016                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2017                 goto get_sk;
2018         }
2019         cur = NULL;
2020 out:
2021         return cur;
2022 }
2023
2024 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2025 {
2026         void *rc = listening_get_next(seq, NULL);
2027
2028         while (rc && *pos) {
2029                 rc = listening_get_next(seq, rc);
2030                 --*pos;
2031         }
2032         return rc;
2033 }
2034
2035 static void *established_get_first(struct seq_file *seq)
2036 {
2037         struct tcp_iter_state* st = seq->private;
2038         struct net *net = seq_file_net(seq);
2039         void *rc = NULL;
2040
2041         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2042                 struct sock *sk;
2043                 struct hlist_node *node;
2044                 struct inet_timewait_sock *tw;
2045                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2046
2047                 read_lock_bh(lock);
2048                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2049                         if (sk->sk_family != st->family ||
2050                             !net_eq(sock_net(sk), net)) {
2051                                 continue;
2052                         }
2053                         rc = sk;
2054                         goto out;
2055                 }
2056                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2057                 inet_twsk_for_each(tw, node,
2058                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2059                         if (tw->tw_family != st->family ||
2060                             !net_eq(twsk_net(tw), net)) {
2061                                 continue;
2062                         }
2063                         rc = tw;
2064                         goto out;
2065                 }
2066                 read_unlock_bh(lock);
2067                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2068         }
2069 out:
2070         return rc;
2071 }
2072
/*
 * Advance the iterator within the established hash.
 *
 * @cur is the element returned previously: a struct sock when st->state is
 * TCP_SEQ_STATE_ESTABLISHED, a struct inet_timewait_sock when it is
 * TCP_SEQ_STATE_TIME_WAIT.  The lock of bucket st->bucket is expected to
 * be held on entry; it is released when the walk moves to the next bucket,
 * whose lock is then taken instead.
 *
 * Returns the next matching element, or NULL once all buckets are done
 * (in which case no bucket lock is held any more).
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
        struct sock *sk = cur;
        struct inet_timewait_sock *tw;
        struct hlist_node *node;
        struct tcp_iter_state* st = seq->private;
        struct net *net = seq_file_net(seq);

        ++st->num;

        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
                tw = cur;
                tw = tw_next(tw);
                /* Also entered from below with tw set to the head of the
                 * current bucket's timewait chain.
                 */
get_tw:
                /* Skip timewait sockets of other families or namespaces. */
                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
                        tw = tw_next(tw);
                }
                if (tw) {
                        cur = tw;
                        goto out;
                }
                /* Timewait chain exhausted: this bucket is finished. */
                read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
                st->state = TCP_SEQ_STATE_ESTABLISHED;

                if (++st->bucket < tcp_hashinfo.ehash_size) {
                        read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
                        sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
                } else {
                        cur = NULL;
                        goto out;
                }
        } else
                sk = sk_next(sk);

        sk_for_each_from(sk, node) {
                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
                        goto found;
        }

        /* No more full sockets in this bucket: switch to its timewait
         * chain.
         */
        st->state = TCP_SEQ_STATE_TIME_WAIT;
        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
        goto get_tw;
found:
        cur = sk;
out:
        return cur;
}
2120
2121 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2122 {
2123         void *rc = established_get_first(seq);
2124
2125         while (rc && pos) {
2126                 rc = established_get_next(seq, rc);
2127                 --pos;
2128         }
2129         return rc;
2130 }
2131
2132 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2133 {
2134         void *rc;
2135         struct tcp_iter_state* st = seq->private;
2136
2137         inet_listen_lock(&tcp_hashinfo);
2138         st->state = TCP_SEQ_STATE_LISTENING;
2139         rc        = listening_get_idx(seq, &pos);
2140
2141         if (!rc) {
2142                 inet_listen_unlock(&tcp_hashinfo);
2143                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2144                 rc        = established_get_idx(seq, pos);
2145         }
2146
2147         return rc;
2148 }
2149
2150 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2151 {
2152         struct tcp_iter_state* st = seq->private;
2153         st->state = TCP_SEQ_STATE_LISTENING;
2154         st->num = 0;
2155         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2156 }
2157
2158 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2159 {
2160         void *rc = NULL;
2161         struct tcp_iter_state* st;
2162
2163         if (v == SEQ_START_TOKEN) {
2164                 rc = tcp_get_idx(seq, 0);
2165                 goto out;
2166         }
2167         st = seq->private;
2168
2169         switch (st->state) {
2170         case TCP_SEQ_STATE_OPENREQ:
2171         case TCP_SEQ_STATE_LISTENING:
2172                 rc = listening_get_next(seq, v);
2173                 if (!rc) {
2174                         inet_listen_unlock(&tcp_hashinfo);
2175                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2176                         rc        = established_get_first(seq);
2177                 }
2178                 break;
2179         case TCP_SEQ_STATE_ESTABLISHED:
2180         case TCP_SEQ_STATE_TIME_WAIT:
2181                 rc = established_get_next(seq, v);
2182                 break;
2183         }
2184 out:
2185         ++*pos;
2186         return rc;
2187 }
2188
2189 static void tcp_seq_stop(struct seq_file *seq, void *v)
2190 {
2191         struct tcp_iter_state* st = seq->private;
2192
2193         switch (st->state) {
2194         case TCP_SEQ_STATE_OPENREQ:
2195                 if (v) {
2196                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2197                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2198                 }
2199         case TCP_SEQ_STATE_LISTENING:
2200                 if (v != SEQ_START_TOKEN)
2201                         inet_listen_unlock(&tcp_hashinfo);
2202                 break;
2203         case TCP_SEQ_STATE_TIME_WAIT:
2204         case TCP_SEQ_STATE_ESTABLISHED:
2205                 if (v)
2206                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2207                 break;
2208         }
2209 }
2210
2211 static int tcp_seq_open(struct inode *inode, struct file *file)
2212 {
2213         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2214         struct tcp_iter_state *s;
2215         int err;
2216
2217         err = seq_open_net(inode, file, &afinfo->seq_ops,
2218                           sizeof(struct tcp_iter_state));
2219         if (err < 0)
2220                 return err;
2221
2222         s = ((struct seq_file *)file->private_data)->private;
2223         s->family               = afinfo->family;
2224         return 0;
2225 }
2226
2227 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2228 {
2229         int rc = 0;
2230         struct proc_dir_entry *p;
2231
2232         afinfo->seq_fops.open           = tcp_seq_open;
2233         afinfo->seq_fops.read           = seq_read;
2234         afinfo->seq_fops.llseek         = seq_lseek;
2235         afinfo->seq_fops.release        = seq_release_net;
2236
2237         afinfo->seq_ops.start           = tcp_seq_start;
2238         afinfo->seq_ops.next            = tcp_seq_next;
2239         afinfo->seq_ops.stop            = tcp_seq_stop;
2240
2241         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2242                              &afinfo->seq_fops, afinfo);
2243         if (!p)
2244                 rc = -ENOMEM;
2245         return rc;
2246 }
2247
/*
 * Remove the proc entry named afinfo->name from @net; counterpart of
 * tcp_proc_register().
 */
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
        proc_net_remove(net, afinfo->name);
}
2252
/*
 * Print one /proc row for an open (not yet accepted) connection request.
 *
 * @sk:  listening socket the request is queued on; supplies the local port
 *       and the reference count that is printed
 * @req: the request itself
 * @f:   output seq_file
 * @i:   row number
 * @uid: owner uid to print
 * @len: receives, via %n, the number of characters printed
 */
static void get_openreq4(struct sock *sk, struct request_sock *req,
                         struct seq_file *f, int i, int uid, int *len)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        /* Time left until the request expires, in jiffies. */
        int ttd = req->expires - jiffies;

        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
                i,
                ireq->loc_addr,
                ntohs(inet_sk(sk)->sport),
                ireq->rmt_addr,
                ntohs(ireq->rmt_port),
                TCP_SYN_RECV,
                0, 0, /* could print option size, but that is af dependent. */
                1,    /* timers active (only the expire timer) */
                jiffies_to_clock_t(ttd),
                req->retrans,
                uid,
                0,  /* non standard timer */
                0, /* open_requests have no inode */
                atomic_read(&sk->sk_refcnt),
                req,
                len);
}
2278
/*
 * Print one /proc row for a full (listening or connected) TCP socket.
 *
 * @sk:  socket to describe
 * @f:   output seq_file
 * @i:   row number
 * @len: receives, via %n, the number of characters printed
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
        int timer_active;
        unsigned long timer_expires;
        struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_sock *inet = inet_sk(sk);
        __be32 dest = inet->daddr;
        __be32 src = inet->rcv_saddr;
        __u16 destp = ntohs(inet->dport);
        __u16 srcp = ntohs(inet->sport);

        /* Timer code printed in the "tr" column: 1 = retransmit timer,
         * 4 = zero-window probe timer, 2 = sk_timer pending, 0 = none.
         */
        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
                timer_active    = 1;
                timer_expires   = icsk->icsk_timeout;
        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
                timer_active    = 4;
                timer_expires   = icsk->icsk_timeout;
        } else if (timer_pending(&sk->sk_timer)) {
                timer_active    = 2;
                timer_expires   = sk->sk_timer.expires;
        } else {
                timer_active    = 0;
                /* Makes the printed time-to-expiry come out as zero. */
                timer_expires = jiffies;
        }

        /* The rx_queue column shows the accept backlog for listeners and
         * the amount of received-but-not-yet-copied data otherwise; a slow
         * start threshold of 0xFFFF or more is printed as -1.
         */
        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
                        "%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
                i, src, srcp, dest, destp, sk->sk_state,
                tp->write_seq - tp->snd_una,
                sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
                                             (tp->rcv_nxt - tp->copied_seq),
                timer_active,
                jiffies_to_clock_t(timer_expires - jiffies),
                icsk->icsk_retransmits,
                sock_i_uid(sk),
                icsk->icsk_probes_out,
                sock_i_ino(sk),
                atomic_read(&sk->sk_refcnt), sk,
                icsk->icsk_rto,
                icsk->icsk_ack.ato,
                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
                tp->snd_cwnd,
                tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
                len);
}
2325
/*
 * Print one /proc row for a TIME_WAIT socket.
 *
 * @tw:  timewait socket to describe
 * @f:   output seq_file
 * @i:   row number
 * @len: receives, via %n, the number of characters printed
 *
 * Columns that have no meaning for a timewait socket are printed as 0;
 * the timer code is always 3.
 */
static void get_timewait4_sock(struct inet_timewait_sock *tw,
                               struct seq_file *f, int i, int *len)
{
        __be32 dest, src;
        __u16 destp, srcp;
        /* Remaining lifetime in jiffies, clamped so it is never negative. */
        int ttd = tw->tw_ttd - jiffies;

        if (ttd < 0)
                ttd = 0;

        dest  = tw->tw_daddr;
        src   = tw->tw_rcv_saddr;
        destp = ntohs(tw->tw_dport);
        srcp  = ntohs(tw->tw_sport);

        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
                atomic_read(&tw->tw_refcnt), tw, len);
}
2347
2348 #define TMPSZ 150
2349
2350 static int tcp4_seq_show(struct seq_file *seq, void *v)
2351 {
2352         struct tcp_iter_state* st;
2353         int len;
2354
2355         if (v == SEQ_START_TOKEN) {
2356                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2357                            "  sl  local_address rem_address   st tx_queue "
2358                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2359                            "inode");
2360                 goto out;
2361         }
2362         st = seq->private;
2363
2364         switch (st->state) {
2365         case TCP_SEQ_STATE_LISTENING:
2366         case TCP_SEQ_STATE_ESTABLISHED:
2367                 get_tcp4_sock(v, seq, st->num, &len);
2368                 break;
2369         case TCP_SEQ_STATE_OPENREQ:
2370                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2371                 break;
2372         case TCP_SEQ_STATE_TIME_WAIT:
2373                 get_timewait4_sock(v, seq, st->num, &len);
2374                 break;
2375         }
2376         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2377 out:
2378         return 0;
2379 }
2380
/*
 * Description of the IPv4 TCP proc file ("tcp").  Only the family-specific
 * parts are set here; tcp_proc_register() fills in the remaining seq_ops
 * and seq_fops callbacks.
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
        .name           = "tcp",
        .family         = AF_INET,
        .seq_fops       = {
                .owner          = THIS_MODULE,
        },
        .seq_ops        = {
                .show           = tcp4_seq_show,
        },
};
2391
/* Per-namespace init: register the IPv4 TCP proc file in @net. */
static int tcp4_proc_init_net(struct net *net)
{
        return tcp_proc_register(net, &tcp4_seq_afinfo);
}
2396
/* Per-namespace exit: remove the IPv4 TCP proc file from @net. */
static void tcp4_proc_exit_net(struct net *net)
{
        tcp_proc_unregister(net, &tcp4_seq_afinfo);
}
2401
/* Per-namespace hooks that create and remove the IPv4 TCP proc file. */
static struct pernet_operations tcp4_net_ops = {
        .init = tcp4_proc_init_net,
        .exit = tcp4_proc_exit_net,
};
2406
/*
 * Register the per-namespace proc hooks for IPv4 TCP.
 * Returns the result of register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
        return register_pernet_subsys(&tcp4_net_ops);
}
2411
/* Unregister the per-namespace proc hooks for IPv4 TCP. */
void tcp4_proc_exit(void)
{
        unregister_pernet_subsys(&tcp4_net_ops);
}
2416 #endif /* CONFIG_PROC_FS */
2417
/*
 * Protocol descriptor for IPv4 TCP sockets: connects the generic socket
 * layer to the TCP handlers, accounting counters, sysctl limits and hash
 * tables.  Exported for use outside this file.
 */
struct proto tcp_prot = {
        .name                   = "TCP",
        .owner                  = THIS_MODULE,
        .close                  = tcp_close,
        .connect                = tcp_v4_connect,
        .disconnect             = tcp_disconnect,
        .accept                 = inet_csk_accept,
        .ioctl                  = tcp_ioctl,
        .init                   = tcp_v4_init_sock,
        .destroy                = tcp_v4_destroy_sock,
        .shutdown               = tcp_shutdown,
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
        .recvmsg                = tcp_recvmsg,
        /* Same handler tcp_v4_rcv() calls directly when the socket is not
         * owned by user context.
         */
        .backlog_rcv            = tcp_v4_do_rcv,
        .hash                   = inet_hash,
        .unhash                 = inet_unhash,
        .get_port               = inet_csk_get_port,
        .enter_memory_pressure  = tcp_enter_memory_pressure,
        .sockets_allocated      = &tcp_sockets_allocated,
        .orphan_count           = &tcp_orphan_count,
        .memory_allocated       = &tcp_memory_allocated,
        .memory_pressure        = &tcp_memory_pressure,
        .sysctl_mem             = sysctl_tcp_mem,
        .sysctl_wmem            = sysctl_tcp_wmem,
        .sysctl_rmem            = sysctl_tcp_rmem,
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp_sock),
        .twsk_prot              = &tcp_timewait_sock_ops,
        .rsk_prot               = &tcp_request_sock_ops,
        .h.hashinfo             = &tcp_hashinfo,
#ifdef CONFIG_COMPAT
        /* 32-bit compatibility variants of the socket option handlers. */
        .compat_setsockopt      = compat_tcp_setsockopt,
        .compat_getsockopt      = compat_tcp_getsockopt,
#endif
};
2454
2455
/*
 * Per-namespace init: create the TCP control socket net->ipv4.tcp_sock
 * (PF_INET, SOCK_RAW, IPPROTO_TCP).
 * Returns the result of inet_ctl_sock_create().
 */
static int __net_init tcp_sk_init(struct net *net)
{
        return inet_ctl_sock_create(&net->ipv4.tcp_sock,
                                    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}
2461
/* Per-namespace exit: destroy the control socket made by tcp_sk_init(). */
static void __net_exit tcp_sk_exit(struct net *net)
{
        inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}
2466
/* Per-namespace hooks managing the TCP control socket. */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init = tcp_sk_init,
       .exit = tcp_sk_exit,
};
2471
/*
 * Boot-time initialisation of IPv4 TCP: register the per-namespace control
 * socket hooks.  Failure is treated as fatal and panics the kernel.
 */
void __init tcp_v4_init(void)
{
        if (register_pernet_device(&tcp_sk_ops))
                panic("Failed to create the TCP control socket.\n");
}
2477
/* Symbols from this file made available to other kernel code. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
/* The proc helpers only exist when procfs support is built in. */
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);
2493