tcp: Fix slowness in read /proc/net/tcp
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */


#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;


#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        return NULL;
}
#endif

struct inet_hashinfo tcp_hashinfo;

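/*
 * Pick the initial sequence number for an incoming connection: a keyed
 * hash of the connection 4-tuple plus a fine-grained clock (see
 * secure_tcp_sequence_number()), which makes ISNs hard to predict.
 */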
static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source);
}

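/*
 * Called when a new connection collides with a socket in TIME-WAIT:
 * decide whether the old TIME-WAIT state may safely be reused.
 * Returns 1 if reuse is safe, 0 otherwise.
 */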
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint of data integrity.
           Even without PAWS it is safe provided sequence spaces do not
           overlap, i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's: the timestamp cache is
           held not per host but per port pair, and the TW bucket is
           used as the state holder.

           If the TW bucket has already been destroyed, we fall back to
           VJ's scheme and use the initial timestamp retrieved from the
           peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->inet_sport, usin->sin_port, sk, 1);
        if (tmp < 0) {
                if (tmp == -ENETUNREACH)
                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return tmp;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->inet_saddr)
                inet->inet_saddr = rt->rt_src;
        inet->inet_rcv_saddr = inet->inet_saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /*
                 * VJ's idea. We save the last timestamp seen from
                 * the destination in the peer table when entering
                 * TIME-WAIT state, and initialize rx_opt.ts_recent
                 * from it when trying a new connection.
                 */
                if (peer != NULL &&
                    (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->inet_dport = usin->sin_port;
        inet->inet_daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the
         * hash tables and complete initialization afterwards.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->inet_sport, inet->inet_dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                           inet->inet_daddr,
                                                           inet->inet_sport,
                                                           usin->sin_port);

        inet->inet_id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}

/*
 * This routine does path MTU discovery as defined in RFC 1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests
         * (SYN-ACKs sent out by Linux are always < 576 bytes, so they
         * should go through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the dst entry whether PMTU discovery is
         * forbidden on this route. We just assume that no packet-too-big
         * packets are sent back when PMTU discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's clear that the
                 * old packet has been dropped. This is the new "fast"
                 * path MTU discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some sort of
 * error condition.  If err < 0 then the socket should be closed and
 * the error returned to the user.  If err > 0 it's just the
 * icmp type << 8 | icmp code.  After adjustment, header points to
 * the first 8 bytes of the TCP header.  We need to find the
 * appropriate port.
 *
 * The locking strategy used here is very "optimistic". When someone
 * else accesses the socket the ICMP is just dropped, and for some
 * paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        __u32 seq;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        if (icmp_skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
                        iph->saddr, th->source, inet_iif(icmp_skb));
        if (!sk) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC 1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* Check if icmp_skb allows a revert of the backoff
                 * (see draft-zimmermann-tcp-lcd). */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff)
                        break;

                icsk->icsk_backoff--;
                inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
                                         icsk->icsk_backoff;
                tcp_bound_rto(sk);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
                                tcp_time_stamp - TCP_SKB_CB(skb)->when);

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else if (sock_owned_by_user(sk)) {
                        /* The RTO revert clocked out the retransmission,
                         * but the socket is locked. Will defer. */
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  HZ/20, TCP_RTO_MAX);
                } else {
                        /* The RTO revert clocked out the retransmission.
                         * Will retransmit now. */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                WARN_ON(req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can, e.g., if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * RFC 1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by PMTU discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with the RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

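/*
 * Fill in the TCP checksum of an outgoing segment. With hardware
 * checksum offload (CHECKSUM_PARTIAL) only the pseudo-header sum is
 * stored and the device finishes the job; otherwise the full checksum
 * is computed in software here.
 */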
static void __tcp_v4_send_check(struct sk_buff *skb,
                                __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}

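/*
 * Prepare a GSO segment for checksum offload: clear the checksum field
 * and set up a partial checksum from the IP header's addresses.
 */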
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;

        if (!pskb_may_pull(skb, sizeof(*th)))
                return -EINVAL;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        th->check = 0;
        skb->ip_summed = CHECKSUM_PARTIAL;
        __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
        return 0;
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL
 *                    etc.) for the reset?
 *      Answer: if a packet caused an RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        net = dev_net(skb_dst(skb)->dev);
        ip_send_reply(net->ipv4.tcp_sock, skb,
                      &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
        struct net *net = dev_net(skb_dst(skb)->dev);

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;

        ip_send_reply(net->ipv4.tcp_sock, skb,
                      &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

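/*
 * Answer a segment received in TIME-WAIT with an ACK built from the
 * sequence, window and timestamp state kept in the timewait bucket.
 */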
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
                        );

        inet_twsk_put(tw);
}

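/*
 * ACK a segment on behalf of a connection that is still a request_sock
 * in SYN-RECV; no full socket exists for it yet.
 */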
static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent,
                        0,
                        tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
                              struct request_sock *req,
                              struct request_values *rvp)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, rvp);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        dst_release(dst);
        return err;
}

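/*
 * Retransmit a SYN-ACK for a pending request, counting it as a
 * retransmitted segment in the MIB statistics.
 */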
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
                              struct request_values *rvp)
{
        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
        return tcp_v4_send_synack(sk, NULL, req, rvp);
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

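/*
 * Warn that the listening port looks SYN-flooded; the message says
 * whether we will serve requests via syncookies or drop them. Callers
 * rate-limit this via net_ratelimit().
 */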
static void syn_flood_warning(const struct sk_buff *skb)
{
        const char *msg;

#ifdef CONFIG_SYN_COOKIES
        if (sysctl_tcp_syncookies)
                msg = "Sending cookies";
        else
#endif
                msg = "Dropping request";

        pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
                                ntohs(tcp_hdr(skb)->dest), msg);
}

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
                                              struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
                return NULL;
        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr)
                        return &tp->md5sig_info->keys4[i].base;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                         struct sock *addr_sk)
{
        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
                                                      struct request_sock *req)
{
        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
                      u8 *newkey, u8 newkeylen)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp4_md5sig_key *keys;

        key = tcp_v4_md5_do_lookup(sk, addr);
        if (key) {
                /* Pre-existing entry - just update that one. */
                kfree(key->key);
                key->key = newkey;
                key->keylen = newkeylen;
        } else {
                struct tcp_md5sig_info *md5sig;

                if (!tp->md5sig_info) {
                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
                                                  GFP_ATOMIC);
                        if (!tp->md5sig_info) {
                                kfree(newkey);
                                return -ENOMEM;
                        }
                        sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                }
                if (tcp_alloc_md5sig_pool(sk) == NULL) {
                        kfree(newkey);
                        return -ENOMEM;
                }
                md5sig = tp->md5sig_info;

                if (md5sig->alloced4 == md5sig->entries4) {
                        keys = kmalloc((sizeof(*keys) *
                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
                        if (!keys) {
                                kfree(newkey);
                                tcp_free_md5sig_pool();
                                return -ENOMEM;
                        }

                        if (md5sig->entries4)
                                memcpy(keys, md5sig->keys4,
                                       sizeof(*keys) * md5sig->entries4);

                        /* Free old key list, and reference new one */
                        kfree(md5sig->keys4);
                        md5sig->keys4 = keys;
                        md5sig->alloced4++;
                }
                md5sig->entries4++;
                md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
                md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
                md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
        }
        return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

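/* Add an MD5 key keyed by the destination address of addr_sk. */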
static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
                               u8 *newkey, u8 newkeylen)
{
        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
                                 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr) {
                        /* Free the key */
                        kfree(tp->md5sig_info->keys4[i].base.key);
                        tp->md5sig_info->entries4--;

                        if (tp->md5sig_info->entries4 == 0) {
                                kfree(tp->md5sig_info->keys4);
                                tp->md5sig_info->keys4 = NULL;
                                tp->md5sig_info->alloced4 = 0;
                        } else if (tp->md5sig_info->entries4 != i) {
                                /* Need to do some manipulation */
                                memmove(&tp->md5sig_info->keys4[i],
                                        &tp->md5sig_info->keys4[i+1],
                                        (tp->md5sig_info->entries4 - i) *
                                         sizeof(struct tcp4_md5sig_key));
                        }
                        tcp_free_md5sig_pool();
                        return 0;
                }
        }
        return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Free each key, then the list of keys itself,
         * the crypto element, and then decrement our
         * hold on the last-resort crypto.
         */
        if (tp->md5sig_info->entries4) {
                int i;
                for (i = 0; i < tp->md5sig_info->entries4; i++)
                        kfree(tp->md5sig_info->keys4[i].base.key);
                tp->md5sig_info->entries4 = 0;
                tcp_free_md5sig_pool();
        }
        if (tp->md5sig_info->keys4) {
                kfree(tp->md5sig_info->keys4);
                tp->md5sig_info->keys4 = NULL;
                tp->md5sig_info->alloced4  = 0;
        }
}

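/*
 * setsockopt(TCP_MD5SIG) handler: an empty key deletes the entry for
 * the given peer address, otherwise the key is added (or replaced).
 */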
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 *newkey;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
                if (!tcp_sk(sk)->md5sig_info)
                        return -ENOENT;
                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
        }

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        if (!tcp_sk(sk)->md5sig_info) {
                struct tcp_sock *tp = tcp_sk(sk);
                struct tcp_md5sig_info *p;

                p = kzalloc(sizeof(*p), sk->sk_allocation);
                if (!p)
                        return -ENOMEM;

                tp->md5sig_info = p;
                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
        }

        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
        if (!newkey)
                return -ENOMEM;
        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
                                 newkey, cmd.tcpm_keylen);
}

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
                                        __be32 daddr, __be32 saddr, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;

        bp = &hp->md5_blk.ip4;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        sg_init_one(&sg, bp, sizeof(*bp));
        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct hash_desc *desc;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        desc = &hp->md5_desc;

        if (crypto_hash_init(desc))
                goto clear_hash;
        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        if (crypto_hash_final(desc, md5_hash))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
                        struct sock *sk, struct request_sock *req,
                        struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct hash_desc *desc;
        struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) {
                saddr = inet_sk(sk)->inet_saddr;
                daddr = inet_sk(sk)->inet_daddr;
        } else if (req) {
                saddr = inet_rsk(req)->loc_addr;
                daddr = inet_rsk(req)->rmt_addr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        desc = &hp->md5_desc;

        if (crypto_hash_init(desc))
                goto clear_hash;

        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        if (crypto_hash_final(desc, md5_hash))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
        /*
         * This gets called for each TCP segment that arrives,
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return 0;

        if (hash_expected && !hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return 1;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return 1;
        }

        /* Okay, we have both hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                if (net_ratelimit()) {
                        printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                               &iph->saddr, ntohs(th->source),
                               &iph->daddr, ntohs(th->dest),
                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
                }
                return 1;
        }
        return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
        .twsk_unique    = tcp_twsk_unique,
        .twsk_destructor= tcp_twsk_destructor,
};

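/*
 * Handle an incoming SYN on a listening socket: allocate a
 * request_sock, parse the options, choose the ISN (or encode it as a
 * syncookie) and answer with a SYN-ACK.
 */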
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_extend_values tmp_ext;
        struct tcp_options_received tmp_opt;
        u8 *hash_location;
        struct request_sock *req;
        struct inet_request_sock *ireq;
        struct tcp_sock *tp = tcp_sk(sk);
        struct dst_entry *dst = NULL;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast addresses. */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitation; they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
                if (net_ratelimit())
                        syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* The accept backlog is full. If we have already queued enough
         * warm entries in the SYN queue, drop the request. That is
         * better than clogging the SYN queue with openreqs whose
         * timeouts grow exponentially.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
        tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

        if (tmp_opt.cookie_plus > 0 &&
            tmp_opt.saw_tstamp &&
            !tp->rx_opt.cookie_out_never &&
            (sysctl_tcp_cookie_size > 0 ||
             (tp->cookie_values != NULL &&
              tp->cookie_values->cookie_desired > 0))) {
                u8 *c;
                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
                        goto drop_and_release;

                /* Secret recipe starts with IP addresses */
                *mess++ ^= (__force u32)daddr;
                *mess++ ^= (__force u32)saddr;

                /* plus variable length Initiator Cookie */
                c = (u8 *)mess;
                while (l-- > 0)
                        *c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
                want_cookie = 0;        /* not our kind of cookie */
#endif
                tmp_ext.cookie_out_never = 0; /* false */
                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
        } else if (!tp->rx_opt.cookie_in_always) {
                /* redundant indications, but ensure initialization. */
                tmp_ext.cookie_out_never = 1; /* true */
                tmp_ext.cookie_plus = 0;
        } else {
                goto drop_and_release;
        }
        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        if (!want_cookie)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                req->cookie_ts = tmp_opt.tstamp_ok;
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save the last timestamp seen
                 * from the destination in the peer table when entering
                 * TIME-WAIT state, and check against it before
                 * accepting a new connection request.
                 *
                 * If "isn" is not zero, this request hit an alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies, the last quarter of the
                         * backlog is reserved for destinations proven
                         * to be alive; that is, we keep communicating
                         * with destinations already remembered at the
                         * moment of the SYN flood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
                                       &saddr, ntohs(tcp_hdr(skb)->source));
                        goto drop_and_release;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (tcp_v4_send_synack(sk, dst, req,
                               (struct request_values *)&tmp_ext) ||
            want_cookie)
                goto drop_and_free;

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

drop_and_release:
        dst_release(dst);
drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}


/*
 * The three-way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                goto exit;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(newsk, dst);

        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        newinet->inet_daddr   = ireq->rmt_addr;
        newinet->inet_rcv_saddr = ireq->loc_addr;
        newinet->inet_saddr   = ireq->loc_addr;
        newinet->opt          = ireq->opt;
        ireq->opt             = NULL;
        newinet->mc_index     = inet_iif(skb);
        newinet->mc_ttl       = ip_hdr(skb)->ttl;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (newinet->opt)
                inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
        newinet->inet_id = newtp->write_seq ^ jiffies;

        tcp_mtup_init(newsk);
        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
        if (tcp_sk(sk)->rx_opt.user_mss &&
            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

        tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
        /* Copy over the MD5 key from the original socket */
        key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
        if (key != NULL) {
                /*
                 * We're using one, so create a matching key
                 * on the newsk structure. If we fail to get
                 * memory, then we end up not copying the key
                 * across. Shucks.
                 */
                char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
                if (newkey != NULL)
                        tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
                                          newkey, key->keylen);
                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
        }
#endif

        __inet_hash_nolisten(newsk, NULL);
        __inet_inherit_port(sk, newsk);

        return newsk;

exit_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        dst_release(dst);
        return NULL;
}

1484 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1485 {
1486         struct tcphdr *th = tcp_hdr(skb);
1487         const struct iphdr *iph = ip_hdr(skb);
1488         struct sock *nsk;
1489         struct request_sock **prev;
1490         /* Find possible connection requests. */
1491         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1492                                                        iph->saddr, iph->daddr);
1493         if (req)
1494                 return tcp_check_req(sk, skb, req, prev);
1495
1496         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1497                         th->source, iph->daddr, th->dest, inet_iif(skb));
1498
1499         if (nsk) {
1500                 if (nsk->sk_state != TCP_TIME_WAIT) {
1501                         bh_lock_sock(nsk);
1502                         return nsk;
1503                 }
1504                 inet_twsk_put(inet_twsk(nsk));
1505                 return NULL;
1506         }
1507
1508 #ifdef CONFIG_SYN_COOKIES
1509         if (!th->syn)
1510                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1511 #endif
1512         return sk;
1513 }
1514
1515 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1516 {
1517         const struct iphdr *iph = ip_hdr(skb);
1518
1519         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1520                 if (!tcp_v4_check(skb->len, iph->saddr,
1521                                   iph->daddr, skb->csum)) {
1522                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1523                         return 0;
1524                 }
1525         }
1526
1527         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1528                                        skb->len, IPPROTO_TCP, 0);
1529
1530         if (skb->len <= 76) {
1531                 return __skb_checksum_complete(skb);
1532         }
1533         return 0;
1534 }
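
/*
 * Illustrative sketch (hypothetical helper, not part of this file): a
 * from-scratch verification of an IPv4 TCP segment.  tcp_v4_checksum_init()
 * above avoids this full pass when the device already summed the packet
 * (CHECKSUM_COMPLETE), or when the segment is long enough (> 76 bytes)
 * that the check can be completed lazily during the copy to user space.
 */
#if 0
static __sum16 example_tcp_v4_csum_verify(__be32 saddr, __be32 daddr,
					  const void *th, unsigned int len)
{
	/* Seed with the RFC 793 pseudo-header, then sum the TCP header
	 * plus payload; a valid segment folds to zero. */
	__wsum sum = csum_tcpudp_nofold(saddr, daddr, len, IPPROTO_TCP, 0);

	return csum_fold(csum_partial(th, len, sum));
}
#endif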
1535
1536
1537 /* The socket must have its spinlock held when we get
1538  * here.
1539  *
1540  * We have a potential double-lock case here, so even when
1541  * doing backlog processing we use the BH locking scheme.
1542  * This is because we cannot sleep with the original spinlock
1543  * held.
1544  */
1545 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1546 {
1547         struct sock *rsk;
1548 #ifdef CONFIG_TCP_MD5SIG
1549         /*
1550          * We really want to reject the packet as early as possible
1551          * if:
1552          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1553          *  o There is an MD5 option and we're not expecting one
1554          */
1555         if (tcp_v4_inbound_md5_hash(sk, skb))
1556                 goto discard;
1557 #endif
1558
1559         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1560                 sock_rps_save_rxhash(sk, skb->rxhash);
1561                 TCP_CHECK_TIMER(sk);
1562                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1563                         rsk = sk;
1564                         goto reset;
1565                 }
1566                 TCP_CHECK_TIMER(sk);
1567                 return 0;
1568         }
1569
1570         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1571                 goto csum_err;
1572
1573         if (sk->sk_state == TCP_LISTEN) {
1574                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1575                 if (!nsk)
1576                         goto discard;
1577
1578                 if (nsk != sk) {
1579                         if (tcp_child_process(sk, nsk, skb)) {
1580                                 rsk = nsk;
1581                                 goto reset;
1582                         }
1583                         return 0;
1584                 }
1585         } else
1586                 sock_rps_save_rxhash(sk, skb->rxhash);
1587
1588
1589         TCP_CHECK_TIMER(sk);
1590         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1591                 rsk = sk;
1592                 goto reset;
1593         }
1594         TCP_CHECK_TIMER(sk);
1595         return 0;
1596
1597 reset:
1598         tcp_v4_send_reset(rsk, skb);
1599 discard:
1600         kfree_skb(skb);
1601         /* Be careful here. If this function gets more complicated and
1602          * gcc suffers from register pressure on the x86, sk (in %ebx)
1603          * might be destroyed here. This current version compiles correctly,
1604          * but you have been warned.
1605          */
1606         return 0;
1607
1608 csum_err:
1609         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1610         goto discard;
1611 }
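
/*
 * Illustrative sketch (hypothetical caller): the BH locking discipline
 * described in the comment above tcp_v4_do_rcv().  tcp_v4_rcv() below is
 * the real instance: if the socket is owned by a process, the segment
 * goes to the backlog and is replayed by release_sock() instead of being
 * processed in softirq context.
 */
#if 0
static void example_deliver(struct sock *sk, struct sk_buff *skb)
{
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		tcp_v4_do_rcv(sk, skb);		/* process immediately */
	else if (sk_add_backlog(sk, skb))
		kfree_skb(skb);			/* backlog limit exceeded */
	bh_unlock_sock(sk);
}
#endif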
1612
1613 /*
1614  *      From tcp_input.c
1615  */
1616
1617 int tcp_v4_rcv(struct sk_buff *skb)
1618 {
1619         const struct iphdr *iph;
1620         struct tcphdr *th;
1621         struct sock *sk;
1622         int ret;
1623         struct net *net = dev_net(skb->dev);
1624
1625         if (skb->pkt_type != PACKET_HOST)
1626                 goto discard_it;
1627
1628         /* Count it even if it's bad */
1629         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1630
1631         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1632                 goto discard_it;
1633
1634         th = tcp_hdr(skb);
1635
1636         if (th->doff < sizeof(struct tcphdr) / 4)
1637                 goto bad_packet;
1638         if (!pskb_may_pull(skb, th->doff * 4))
1639                 goto discard_it;
1640
1641         /* An explanation is required here, I think.
1642          * Packet length and doff are validated by header prediction,
1643          * provided the case of th->doff == 0 is eliminated above.
1644          * So, we defer the checks. */
1645         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1646                 goto bad_packet;
1647
1648         th = tcp_hdr(skb);
1649         iph = ip_hdr(skb);
1650         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1651         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1652                                     skb->len - th->doff * 4);
1653         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1654         TCP_SKB_CB(skb)->when    = 0;
1655         TCP_SKB_CB(skb)->flags   = iph->tos;
1656         TCP_SKB_CB(skb)->sacked  = 0;
1657
1658         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1659         if (!sk)
1660                 goto no_tcp_socket;
1661
1662 process:
1663         if (sk->sk_state == TCP_TIME_WAIT)
1664                 goto do_time_wait;
1665
1666         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1667                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1668                 goto discard_and_relse;
1669         }
1670
1671         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1672                 goto discard_and_relse;
1673         nf_reset(skb);
1674
1675         if (sk_filter(sk, skb))
1676                 goto discard_and_relse;
1677
1678         skb->dev = NULL;
1679
1680         bh_lock_sock_nested(sk);
1681         ret = 0;
1682         if (!sock_owned_by_user(sk)) {
1683 #ifdef CONFIG_NET_DMA
1684                 struct tcp_sock *tp = tcp_sk(sk);
1685                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1686                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1687                 if (tp->ucopy.dma_chan)
1688                         ret = tcp_v4_do_rcv(sk, skb);
1689                 else
1690 #endif
1691                 {
1692                         if (!tcp_prequeue(sk, skb))
1693                                 ret = tcp_v4_do_rcv(sk, skb);
1694                 }
1695         } else if (unlikely(sk_add_backlog(sk, skb))) {
1696                 bh_unlock_sock(sk);
1697                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1698                 goto discard_and_relse;
1699         }
1700         bh_unlock_sock(sk);
1701
1702         sock_put(sk);
1703
1704         return ret;
1705
1706 no_tcp_socket:
1707         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1708                 goto discard_it;
1709
1710         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1711 bad_packet:
1712                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1713         } else {
1714                 tcp_v4_send_reset(NULL, skb);
1715         }
1716
1717 discard_it:
1718         /* Discard frame. */
1719         kfree_skb(skb);
1720         return 0;
1721
1722 discard_and_relse:
1723         sock_put(sk);
1724         goto discard_it;
1725
1726 do_time_wait:
1727         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1728                 inet_twsk_put(inet_twsk(sk));
1729                 goto discard_it;
1730         }
1731
1732         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1733                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1734                 inet_twsk_put(inet_twsk(sk));
1735                 goto discard_it;
1736         }
1737         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1738         case TCP_TW_SYN: {
1739                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1740                                                         &tcp_hashinfo,
1741                                                         iph->daddr, th->dest,
1742                                                         inet_iif(skb));
1743                 if (sk2) {
1744                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1745                         inet_twsk_put(inet_twsk(sk));
1746                         sk = sk2;
1747                         goto process;
1748                 }
1749                 /* Fall through to ACK */
1750         }
1751         case TCP_TW_ACK:
1752                 tcp_v4_timewait_ack(sk, skb);
1753                 break;
1754         case TCP_TW_RST:
1755                 goto no_tcp_socket;
1756         case TCP_TW_SUCCESS:;
1757         }
1758         goto discard_it;
1759 }
1760
1761 /* VJ's idea: save the last timestamp seen from this destination
1762  * and hold it for at least the normal timewait interval, for use in
1763  * duplicate-segment detection in subsequent connections before they
1764  * enter the synchronized state.
1765  */
1766
1767 int tcp_v4_remember_stamp(struct sock *sk)
1768 {
1769         struct inet_sock *inet = inet_sk(sk);
1770         struct tcp_sock *tp = tcp_sk(sk);
1771         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1772         struct inet_peer *peer = NULL;
1773         int release_it = 0;
1774
1775         if (!rt || rt->rt_dst != inet->inet_daddr) {
1776                 peer = inet_getpeer(inet->inet_daddr, 1);
1777                 release_it = 1;
1778         } else {
1779                 if (!rt->peer)
1780                         rt_bind_peer(rt, 1);
1781                 peer = rt->peer;
1782         }
1783
1784         if (peer) {
1785                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1786                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1787                      peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1788                         peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1789                         peer->tcp_ts = tp->rx_opt.ts_recent;
1790                 }
1791                 if (release_it)
1792                         inet_putpeer(peer);
1793                 return 1;
1794         }
1795
1796         return 0;
1797 }
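
/*
 * Illustrative sketch (hypothetical helper): the wraparound-safe "newer
 * than" test that both remember_stamp routines rely on.  Computing the
 * difference in u32 and then reinterpreting it as s32 keeps the
 * comparison correct even after the 32-bit timestamp clock wraps.
 */
#if 0
static bool example_ts_after(u32 a, u32 b)
{
	return (s32)(a - b) > 0;	/* "a is later than b", modulo 2^32 */
}
#endif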
1798
1799 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1800 {
1801         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1802
1803         if (peer) {
1804                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1805
1806                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1807                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1808                      peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1809                         peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1810                         peer->tcp_ts       = tcptw->tw_ts_recent;
1811                 }
1812                 inet_putpeer(peer);
1813                 return 1;
1814         }
1815
1816         return 0;
1817 }
1818
1819 const struct inet_connection_sock_af_ops ipv4_specific = {
1820         .queue_xmit        = ip_queue_xmit,
1821         .send_check        = tcp_v4_send_check,
1822         .rebuild_header    = inet_sk_rebuild_header,
1823         .conn_request      = tcp_v4_conn_request,
1824         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1825         .remember_stamp    = tcp_v4_remember_stamp,
1826         .net_header_len    = sizeof(struct iphdr),
1827         .setsockopt        = ip_setsockopt,
1828         .getsockopt        = ip_getsockopt,
1829         .addr2sockaddr     = inet_csk_addr2sockaddr,
1830         .sockaddr_len      = sizeof(struct sockaddr_in),
1831         .bind_conflict     = inet_csk_bind_conflict,
1832 #ifdef CONFIG_COMPAT
1833         .compat_setsockopt = compat_ip_setsockopt,
1834         .compat_getsockopt = compat_ip_getsockopt,
1835 #endif
1836 };
1837
1838 #ifdef CONFIG_TCP_MD5SIG
1839 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1840         .md5_lookup             = tcp_v4_md5_lookup,
1841         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1842         .md5_add                = tcp_v4_md5_add_func,
1843         .md5_parse              = tcp_v4_parse_md5_keys,
1844 };
1845 #endif
1846
1847 /* NOTE: A lot of things are set to zero explicitly by the call to
1848  *       sk_alloc(), so they need not be done here.
1849  */
1850 static int tcp_v4_init_sock(struct sock *sk)
1851 {
1852         struct inet_connection_sock *icsk = inet_csk(sk);
1853         struct tcp_sock *tp = tcp_sk(sk);
1854
1855         skb_queue_head_init(&tp->out_of_order_queue);
1856         tcp_init_xmit_timers(sk);
1857         tcp_prequeue_init(tp);
1858
1859         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1860         tp->mdev = TCP_TIMEOUT_INIT;
1861
1862         /* So many TCP implementations out there (incorrectly) count the
1863          * initial SYN frame in their delayed-ACK and congestion control
1864          * algorithms that we must have the following bandaid to talk
1865          * efficiently to them.  -DaveM
1866          */
1867         tp->snd_cwnd = 2;
1868
1869         /* See draft-stevens-tcpca-spec-01 for discussion of the
1870          * initialization of these values.
1871          */
1872         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1873         tp->snd_cwnd_clamp = ~0;
1874         tp->mss_cache = TCP_MSS_DEFAULT;
1875
1876         tp->reordering = sysctl_tcp_reordering;
1877         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1878
1879         sk->sk_state = TCP_CLOSE;
1880
1881         sk->sk_write_space = sk_stream_write_space;
1882         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1883
1884         icsk->icsk_af_ops = &ipv4_specific;
1885         icsk->icsk_sync_mss = tcp_sync_mss;
1886 #ifdef CONFIG_TCP_MD5SIG
1887         tp->af_specific = &tcp_sock_ipv4_specific;
1888 #endif
1889
1890         /* TCP Cookie Transactions */
1891         if (sysctl_tcp_cookie_size > 0) {
1892                 /* Default, cookies without s_data_payload. */
1893                 tp->cookie_values =
1894                         kzalloc(sizeof(*tp->cookie_values),
1895                                 sk->sk_allocation);
1896                 if (tp->cookie_values != NULL)
1897                         kref_init(&tp->cookie_values->kref);
1898         }
1899         /* Presumed zeroed, in order of appearance:
1900          *      cookie_in_always, cookie_out_never,
1901          *      s_data_constant, s_data_in, s_data_out
1902          */
1903         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1904         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1905
1906         local_bh_disable();
1907         percpu_counter_inc(&tcp_sockets_allocated);
1908         local_bh_enable();
1909
1910         return 0;
1911 }
1912
1913 void tcp_v4_destroy_sock(struct sock *sk)
1914 {
1915         struct tcp_sock *tp = tcp_sk(sk);
1916
1917         tcp_clear_xmit_timers(sk);
1918
1919         tcp_cleanup_congestion_control(sk);
1920
1921         /* Clean up the write buffer. */
1922         tcp_write_queue_purge(sk);
1923
1924         /* Cleans up our, hopefully empty, out_of_order_queue. */
1925         __skb_queue_purge(&tp->out_of_order_queue);
1926
1927 #ifdef CONFIG_TCP_MD5SIG
1928         /* Clean up the MD5 key list, if any */
1929         if (tp->md5sig_info) {
1930                 tcp_v4_clear_md5_list(sk);
1931                 kfree(tp->md5sig_info);
1932                 tp->md5sig_info = NULL;
1933         }
1934 #endif
1935
1936 #ifdef CONFIG_NET_DMA
1937         /* Cleans up our sk_async_wait_queue */
1938         __skb_queue_purge(&sk->sk_async_wait_queue);
1939 #endif
1940
1941         /* Clean up the prequeue; it really must be empty by now. */
1942         __skb_queue_purge(&tp->ucopy.prequeue);
1943
1944         /* Clean up a referenced TCP bind bucket. */
1945         if (inet_csk(sk)->icsk_bind_hash)
1946                 inet_put_port(sk);
1947
1948         /*
1949          * If a sendmsg cached page exists, toss it.
1950          */
1951         if (sk->sk_sndmsg_page) {
1952                 __free_page(sk->sk_sndmsg_page);
1953                 sk->sk_sndmsg_page = NULL;
1954         }
1955
1956         /* TCP Cookie Transactions */
1957         if (tp->cookie_values != NULL) {
1958                 kref_put(&tp->cookie_values->kref,
1959                          tcp_cookie_values_release);
1960                 tp->cookie_values = NULL;
1961         }
1962
1963         percpu_counter_dec(&tcp_sockets_allocated);
1964 }
1965
1966 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1967
1968 #ifdef CONFIG_PROC_FS
1969 /* Proc filesystem TCP sock list dumping. */
1970
1971 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1972 {
1973         return hlist_nulls_empty(head) ? NULL :
1974                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1975 }
1976
1977 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1978 {
1979         return !is_a_nulls(tw->tw_node.next) ?
1980                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1981 }
1982
1983 /*
1984  * Get the next listener socket following cur.  If cur is NULL, get the
1985  * first socket starting from the bucket given in st->bucket; when
1986  * st->bucket is zero, the very first socket in the hash table is returned.
1987  */
1988 static void *listening_get_next(struct seq_file *seq, void *cur)
1989 {
1990         struct inet_connection_sock *icsk;
1991         struct hlist_nulls_node *node;
1992         struct sock *sk = cur;
1993         struct inet_listen_hashbucket *ilb;
1994         struct tcp_iter_state *st = seq->private;
1995         struct net *net = seq_file_net(seq);
1996
1997         if (!sk) {
1998                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1999                 spin_lock_bh(&ilb->lock);
2000                 sk = sk_nulls_head(&ilb->head);
2001                 st->offset = 0;
2002                 goto get_sk;
2003         }
2004         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2005         ++st->num;
2006         ++st->offset;
2007
2008         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2009                 struct request_sock *req = cur;
2010
2011                 icsk = inet_csk(st->syn_wait_sk);
2012                 req = req->dl_next;
2013                 while (1) {
2014                         while (req) {
2015                                 if (req->rsk_ops->family == st->family) {
2016                                         cur = req;
2017                                         goto out;
2018                                 }
2019                                 req = req->dl_next;
2020                         }
2021                         st->offset = 0;
2022                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2023                                 break;
2024 get_req:
2025                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2026                 }
2027                 sk        = sk_next(st->syn_wait_sk);
2028                 st->state = TCP_SEQ_STATE_LISTENING;
2029                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2030         } else {
2031                 icsk = inet_csk(sk);
2032                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2033                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2034                         goto start_req;
2035                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2036                 sk = sk_next(sk);
2037         }
2038 get_sk:
2039         sk_nulls_for_each_from(sk, node) {
2040                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2041                         cur = sk;
2042                         goto out;
2043                 }
2044                 icsk = inet_csk(sk);
2045                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2046                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2047 start_req:
2048                         st->uid         = sock_i_uid(sk);
2049                         st->syn_wait_sk = sk;
2050                         st->state       = TCP_SEQ_STATE_OPENREQ;
2051                         st->sbucket     = 0;
2052                         goto get_req;
2053                 }
2054                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2055         }
2056         spin_unlock_bh(&ilb->lock);
2057         st->offset = 0;
2058         if (++st->bucket < INET_LHTABLE_SIZE) {
2059                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2060                 spin_lock_bh(&ilb->lock);
2061                 sk = sk_nulls_head(&ilb->head);
2062                 goto get_sk;
2063         }
2064         cur = NULL;
2065 out:
2066         return cur;
2067 }
2068
2069 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2070 {
2071         struct tcp_iter_state *st = seq->private;
2072         void *rc;
2073
2074         st->bucket = 0;
2075         st->offset = 0;
2076         rc = listening_get_next(seq, NULL);
2077
2078         while (rc && *pos) {
2079                 rc = listening_get_next(seq, rc);
2080                 --*pos;
2081         }
2082         return rc;
2083 }
2084
2085 static inline int empty_bucket(struct tcp_iter_state *st)
2086 {
2087         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2088                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2089 }
2090
2091 /*
2092  * Get the first established socket starting from the bucket given in st->bucket.
2093  * If st->bucket is zero, the very first socket in the hash is returned.
2094  */
2095 static void *established_get_first(struct seq_file *seq)
2096 {
2097         struct tcp_iter_state *st = seq->private;
2098         struct net *net = seq_file_net(seq);
2099         void *rc = NULL;
2100
2101         st->offset = 0;
2102         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2103                 struct sock *sk;
2104                 struct hlist_nulls_node *node;
2105                 struct inet_timewait_sock *tw;
2106                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2107
2108                 /* Lockless fast path for the common case of empty buckets */
2109                 if (empty_bucket(st))
2110                         continue;
2111
2112                 spin_lock_bh(lock);
2113                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2114                         if (sk->sk_family != st->family ||
2115                             !net_eq(sock_net(sk), net)) {
2116                                 continue;
2117                         }
2118                         rc = sk;
2119                         goto out;
2120                 }
2121                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2122                 inet_twsk_for_each(tw, node,
2123                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2124                         if (tw->tw_family != st->family ||
2125                             !net_eq(twsk_net(tw), net)) {
2126                                 continue;
2127                         }
2128                         rc = tw;
2129                         goto out;
2130                 }
2131                 spin_unlock_bh(lock);
2132                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2133         }
2134 out:
2135         return rc;
2136 }
2137
2138 static void *established_get_next(struct seq_file *seq, void *cur)
2139 {
2140         struct sock *sk = cur;
2141         struct inet_timewait_sock *tw;
2142         struct hlist_nulls_node *node;
2143         struct tcp_iter_state *st = seq->private;
2144         struct net *net = seq_file_net(seq);
2145
2146         ++st->num;
2147         ++st->offset;
2148
2149         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2150                 tw = cur;
2151                 tw = tw_next(tw);
2152 get_tw:
2153                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2154                         tw = tw_next(tw);
2155                 }
2156                 if (tw) {
2157                         cur = tw;
2158                         goto out;
2159                 }
2160                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2161                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2162
2163                 /* Look for the next non-empty bucket */
2164                 st->offset = 0;
2165                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2166                                 empty_bucket(st))
2167                         ;
2168                 if (st->bucket > tcp_hashinfo.ehash_mask)
2169                         return NULL;
2170
2171                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2172                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2173         } else
2174                 sk = sk_nulls_next(sk);
2175
2176         sk_nulls_for_each_from(sk, node) {
2177                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2178                         goto found;
2179         }
2180
2181         st->state = TCP_SEQ_STATE_TIME_WAIT;
2182         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2183         goto get_tw;
2184 found:
2185         cur = sk;
2186 out:
2187         return cur;
2188 }
2189
2190 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2191 {
2192         struct tcp_iter_state *st = seq->private;
2193         void *rc;
2194
2195         st->bucket = 0;
2196         rc = established_get_first(seq);
2197
2198         while (rc && pos) {
2199                 rc = established_get_next(seq, rc);
2200                 --pos;
2201         }
2202         return rc;
2203 }
2204
2205 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2206 {
2207         void *rc;
2208         struct tcp_iter_state *st = seq->private;
2209
2210         st->state = TCP_SEQ_STATE_LISTENING;
2211         rc        = listening_get_idx(seq, &pos);
2212
2213         if (!rc) {
2214                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2215                 rc        = established_get_idx(seq, pos);
2216         }
2217
2218         return rc;
2219 }
2220
2221 static void *tcp_seek_last_pos(struct seq_file *seq)
2222 {
2223         struct tcp_iter_state *st = seq->private;
2224         int offset = st->offset;
2225         int orig_num = st->num;
2226         void *rc = NULL;
2227
2228         switch (st->state) {
2229         case TCP_SEQ_STATE_OPENREQ:
2230         case TCP_SEQ_STATE_LISTENING:
2231                 if (st->bucket >= INET_LHTABLE_SIZE)
2232                         break;
2233                 st->state = TCP_SEQ_STATE_LISTENING;
2234                 rc = listening_get_next(seq, NULL);
2235                 while (offset-- && rc)
2236                         rc = listening_get_next(seq, rc);
2237                 if (rc)
2238                         break;
2239                 st->bucket = 0;
2240                 /* Fallthrough */
2241         case TCP_SEQ_STATE_ESTABLISHED:
2242         case TCP_SEQ_STATE_TIME_WAIT:
2243                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2244                 if (st->bucket > tcp_hashinfo.ehash_mask)
2245                         break;
2246                 rc = established_get_first(seq);
2247                 while (offset-- && rc)
2248                         rc = established_get_next(seq, rc);
2249         }
2250
2251         st->num = orig_num;
2252
2253         return rc;
2254 }
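
/*
 * Illustrative sketch (simplified, with hypothetical names): the resume
 * strategy implemented by tcp_seek_last_pos() and the st->bucket /
 * st->offset bookkeeping above.  Without it, every read(2) chunk of
 * /proc/net/tcp would replay the iteration from slot zero, making a full
 * dump of N sockets cost O(N^2); with it, a sequential reader pays only
 * for a rescan of the bucket it stopped in.
 */
#if 0
static void *example_seek(struct example_iter *it, loff_t pos)
{
	if (pos == it->last_pos)	/* the common sequential-read case */
		return example_rescan_bucket(it);	/* O(one bucket) */
	return example_full_scan(it, pos);	/* lseek etc.: O(pos) */
}
#endif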
2255
2256 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2257 {
2258         struct tcp_iter_state *st = seq->private;
2259         void *rc;
2260
2261         if (*pos && *pos == st->last_pos) {
2262                 rc = tcp_seek_last_pos(seq);
2263                 if (rc)
2264                         goto out;
2265         }
2266
2267         st->state = TCP_SEQ_STATE_LISTENING;
2268         st->num = 0;
2269         st->bucket = 0;
2270         st->offset = 0;
2271         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2272
2273 out:
2274         st->last_pos = *pos;
2275         return rc;
2276 }
2277
2278 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2279 {
2280         struct tcp_iter_state *st = seq->private;
2281         void *rc = NULL;
2282
2283         if (v == SEQ_START_TOKEN) {
2284                 rc = tcp_get_idx(seq, 0);
2285                 goto out;
2286         }
2287
2288         switch (st->state) {
2289         case TCP_SEQ_STATE_OPENREQ:
2290         case TCP_SEQ_STATE_LISTENING:
2291                 rc = listening_get_next(seq, v);
2292                 if (!rc) {
2293                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2294                         st->bucket = 0;
2295                         st->offset = 0;
2296                         rc        = established_get_first(seq);
2297                 }
2298                 break;
2299         case TCP_SEQ_STATE_ESTABLISHED:
2300         case TCP_SEQ_STATE_TIME_WAIT:
2301                 rc = established_get_next(seq, v);
2302                 break;
2303         }
2304 out:
2305         ++*pos;
2306         st->last_pos = *pos;
2307         return rc;
2308 }
2309
2310 static void tcp_seq_stop(struct seq_file *seq, void *v)
2311 {
2312         struct tcp_iter_state *st = seq->private;
2313
2314         switch (st->state) {
2315         case TCP_SEQ_STATE_OPENREQ:
2316                 if (v) {
2317                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2318                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2319                 }
2320         case TCP_SEQ_STATE_LISTENING:
2321                 if (v != SEQ_START_TOKEN)
2322                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2323                 break;
2324         case TCP_SEQ_STATE_TIME_WAIT:
2325         case TCP_SEQ_STATE_ESTABLISHED:
2326                 if (v)
2327                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2328                 break;
2329         }
2330 }
2331
2332 static int tcp_seq_open(struct inode *inode, struct file *file)
2333 {
2334         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2335         struct tcp_iter_state *s;
2336         int err;
2337
2338         err = seq_open_net(inode, file, &afinfo->seq_ops,
2339                           sizeof(struct tcp_iter_state));
2340         if (err < 0)
2341                 return err;
2342
2343         s = ((struct seq_file *)file->private_data)->private;
2344         s->family               = afinfo->family;
2345         s->last_pos             = 0;
2346         return 0;
2347 }
2348
2349 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2350 {
2351         int rc = 0;
2352         struct proc_dir_entry *p;
2353
2354         afinfo->seq_fops.open           = tcp_seq_open;
2355         afinfo->seq_fops.read           = seq_read;
2356         afinfo->seq_fops.llseek         = seq_lseek;
2357         afinfo->seq_fops.release        = seq_release_net;
2358
2359         afinfo->seq_ops.start           = tcp_seq_start;
2360         afinfo->seq_ops.next            = tcp_seq_next;
2361         afinfo->seq_ops.stop            = tcp_seq_stop;
2362
2363         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2364                              &afinfo->seq_fops, afinfo);
2365         if (!p)
2366                 rc = -ENOMEM;
2367         return rc;
2368 }
2369
2370 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2371 {
2372         proc_net_remove(net, afinfo->name);
2373 }
2374
2375 static void get_openreq4(struct sock *sk, struct request_sock *req,
2376                          struct seq_file *f, int i, int uid, int *len)
2377 {
2378         const struct inet_request_sock *ireq = inet_rsk(req);
2379         int ttd = req->expires - jiffies;
2380
2381         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2382                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2383                 i,
2384                 ireq->loc_addr,
2385                 ntohs(inet_sk(sk)->inet_sport),
2386                 ireq->rmt_addr,
2387                 ntohs(ireq->rmt_port),
2388                 TCP_SYN_RECV,
2389                 0, 0, /* could print option size, but that is af dependent. */
2390                 1,    /* timers active (only the expire timer) */
2391                 jiffies_to_clock_t(ttd),
2392                 req->retrans,
2393                 uid,
2394                 0,  /* non standard timer */
2395                 0, /* open_requests have no inode */
2396                 atomic_read(&sk->sk_refcnt),
2397                 req,
2398                 len);
2399 }
2400
2401 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2402 {
2403         int timer_active;
2404         unsigned long timer_expires;
2405         struct tcp_sock *tp = tcp_sk(sk);
2406         const struct inet_connection_sock *icsk = inet_csk(sk);
2407         struct inet_sock *inet = inet_sk(sk);
2408         __be32 dest = inet->inet_daddr;
2409         __be32 src = inet->inet_rcv_saddr;
2410         __u16 destp = ntohs(inet->inet_dport);
2411         __u16 srcp = ntohs(inet->inet_sport);
2412         int rx_queue;
2413
2414         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2415                 timer_active    = 1;
2416                 timer_expires   = icsk->icsk_timeout;
2417         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2418                 timer_active    = 4;
2419                 timer_expires   = icsk->icsk_timeout;
2420         } else if (timer_pending(&sk->sk_timer)) {
2421                 timer_active    = 2;
2422                 timer_expires   = sk->sk_timer.expires;
2423         } else {
2424                 timer_active    = 0;
2425                 timer_expires = jiffies;
2426         }
2427
2428         if (sk->sk_state == TCP_LISTEN)
2429                 rx_queue = sk->sk_ack_backlog;
2430         else
2431                 /*
2432                  * Because we don't lock the socket, we might find a transient negative value.
2433                  */
2434                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2435
2436         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2437                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2438                 i, src, srcp, dest, destp, sk->sk_state,
2439                 tp->write_seq - tp->snd_una,
2440                 rx_queue,
2441                 timer_active,
2442                 jiffies_to_clock_t(timer_expires - jiffies),
2443                 icsk->icsk_retransmits,
2444                 sock_i_uid(sk),
2445                 icsk->icsk_probes_out,
2446                 sock_i_ino(sk),
2447                 atomic_read(&sk->sk_refcnt), sk,
2448                 jiffies_to_clock_t(icsk->icsk_rto),
2449                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2450                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2451                 tp->snd_cwnd,
2452                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2453                 len);
2454 }
2455
2456 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2457                                struct seq_file *f, int i, int *len)
2458 {
2459         __be32 dest, src;
2460         __u16 destp, srcp;
2461         int ttd = tw->tw_ttd - jiffies;
2462
2463         if (ttd < 0)
2464                 ttd = 0;
2465
2466         dest  = tw->tw_daddr;
2467         src   = tw->tw_rcv_saddr;
2468         destp = ntohs(tw->tw_dport);
2469         srcp  = ntohs(tw->tw_sport);
2470
2471         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2472                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2473                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2474                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2475                 atomic_read(&tw->tw_refcnt), tw, len);
2476 }
2477
2478 #define TMPSZ 150
2479
2480 static int tcp4_seq_show(struct seq_file *seq, void *v)
2481 {
2482         struct tcp_iter_state *st;
2483         int len;
2484
2485         if (v == SEQ_START_TOKEN) {
2486                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2487                            "  sl  local_address rem_address   st tx_queue "
2488                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2489                            "inode");
2490                 goto out;
2491         }
2492         st = seq->private;
2493
2494         switch (st->state) {
2495         case TCP_SEQ_STATE_LISTENING:
2496         case TCP_SEQ_STATE_ESTABLISHED:
2497                 get_tcp4_sock(v, seq, st->num, &len);
2498                 break;
2499         case TCP_SEQ_STATE_OPENREQ:
2500                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2501                 break;
2502         case TCP_SEQ_STATE_TIME_WAIT:
2503                 get_timewait4_sock(v, seq, st->num, &len);
2504                 break;
2505         }
2506         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2507 out:
2508         return 0;
2509 }
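
/*
 * Usage example (user space, shown here for illustration only): every
 * record that tcp4_seq_show() emits is padded to a fixed width of
 * TMPSZ - 1 characters plus a newline, and tools such as netstat read
 * the file in small chunks.  That chunked access pattern is exactly
 * what the bucket/offset resume logic above keeps cheap.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* header line, then one socket per line */
	fclose(f);
	return 0;
}
#endif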
2510
2511 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2512         .name           = "tcp",
2513         .family         = AF_INET,
2514         .seq_fops       = {
2515                 .owner          = THIS_MODULE,
2516         },
2517         .seq_ops        = {
2518                 .show           = tcp4_seq_show,
2519         },
2520 };
2521
2522 static int __net_init tcp4_proc_init_net(struct net *net)
2523 {
2524         return tcp_proc_register(net, &tcp4_seq_afinfo);
2525 }
2526
2527 static void __net_exit tcp4_proc_exit_net(struct net *net)
2528 {
2529         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2530 }
2531
2532 static struct pernet_operations tcp4_net_ops = {
2533         .init = tcp4_proc_init_net,
2534         .exit = tcp4_proc_exit_net,
2535 };
2536
2537 int __init tcp4_proc_init(void)
2538 {
2539         return register_pernet_subsys(&tcp4_net_ops);
2540 }
2541
2542 void tcp4_proc_exit(void)
2543 {
2544         unregister_pernet_subsys(&tcp4_net_ops);
2545 }
2546 #endif /* CONFIG_PROC_FS */
2547
2548 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2549 {
2550         struct iphdr *iph = skb_gro_network_header(skb);
2551
2552         switch (skb->ip_summed) {
2553         case CHECKSUM_COMPLETE:
2554                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2555                                   skb->csum)) {
2556                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2557                         break;
2558                 }
2559
2560                 /* fall through */
2561         case CHECKSUM_NONE:
2562                 NAPI_GRO_CB(skb)->flush = 1;
2563                 return NULL;
2564         }
2565
2566         return tcp_gro_receive(head, skb);
2567 }
2568 EXPORT_SYMBOL(tcp4_gro_receive);
2569
2570 int tcp4_gro_complete(struct sk_buff *skb)
2571 {
2572         struct iphdr *iph = ip_hdr(skb);
2573         struct tcphdr *th = tcp_hdr(skb);
2574
2575         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2576                                   iph->saddr, iph->daddr, 0);
2577         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2578
2579         return tcp_gro_complete(skb);
2580 }
2581 EXPORT_SYMBOL(tcp4_gro_complete);
2582
2583 struct proto tcp_prot = {
2584         .name                   = "TCP",
2585         .owner                  = THIS_MODULE,
2586         .close                  = tcp_close,
2587         .connect                = tcp_v4_connect,
2588         .disconnect             = tcp_disconnect,
2589         .accept                 = inet_csk_accept,
2590         .ioctl                  = tcp_ioctl,
2591         .init                   = tcp_v4_init_sock,
2592         .destroy                = tcp_v4_destroy_sock,
2593         .shutdown               = tcp_shutdown,
2594         .setsockopt             = tcp_setsockopt,
2595         .getsockopt             = tcp_getsockopt,
2596         .recvmsg                = tcp_recvmsg,
2597         .backlog_rcv            = tcp_v4_do_rcv,
2598         .hash                   = inet_hash,
2599         .unhash                 = inet_unhash,
2600         .get_port               = inet_csk_get_port,
2601         .enter_memory_pressure  = tcp_enter_memory_pressure,
2602         .sockets_allocated      = &tcp_sockets_allocated,
2603         .orphan_count           = &tcp_orphan_count,
2604         .memory_allocated       = &tcp_memory_allocated,
2605         .memory_pressure        = &tcp_memory_pressure,
2606         .sysctl_mem             = sysctl_tcp_mem,
2607         .sysctl_wmem            = sysctl_tcp_wmem,
2608         .sysctl_rmem            = sysctl_tcp_rmem,
2609         .max_header             = MAX_TCP_HEADER,
2610         .obj_size               = sizeof(struct tcp_sock),
2611         .slab_flags             = SLAB_DESTROY_BY_RCU,
2612         .twsk_prot              = &tcp_timewait_sock_ops,
2613         .rsk_prot               = &tcp_request_sock_ops,
2614         .h.hashinfo             = &tcp_hashinfo,
2615 #ifdef CONFIG_COMPAT
2616         .compat_setsockopt      = compat_tcp_setsockopt,
2617         .compat_getsockopt      = compat_tcp_getsockopt,
2618 #endif
2619 };
2620
2621
2622 static int __net_init tcp_sk_init(struct net *net)
2623 {
2624         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2625                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2626 }
2627
2628 static void __net_exit tcp_sk_exit(struct net *net)
2629 {
2630         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2631 }
2632
2633 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2634 {
2635         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2636 }
2637
2638 static struct pernet_operations __net_initdata tcp_sk_ops = {
2639        .init       = tcp_sk_init,
2640        .exit       = tcp_sk_exit,
2641        .exit_batch = tcp_sk_exit_batch,
2642 };
2643
2644 void __init tcp_v4_init(void)
2645 {
2646         inet_hashinfo_init(&tcp_hashinfo);
2647         if (register_pernet_subsys(&tcp_sk_ops))
2648                 panic("Failed to create the TCP control socket.\n");
2649 }
2650
2651 EXPORT_SYMBOL(ipv4_specific);
2652 EXPORT_SYMBOL(tcp_hashinfo);
2653 EXPORT_SYMBOL(tcp_prot);
2654 EXPORT_SYMBOL(tcp_v4_conn_request);
2655 EXPORT_SYMBOL(tcp_v4_connect);
2656 EXPORT_SYMBOL(tcp_v4_do_rcv);
2657 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2658 EXPORT_SYMBOL(tcp_v4_send_check);
2659 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2660
2661 #ifdef CONFIG_PROC_FS
2662 EXPORT_SYMBOL(tcp_proc_register);
2663 EXPORT_SYMBOL(tcp_proc_unregister);
2664 #endif
2665 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2666