/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		Code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash;
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: IP options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path MTU discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support the IPV6_V6ONLY socket option,
 *	Alexey Kuznetsov		which allows IPv4 and IPv6 sockets to
 *					bind to a single port at the same time.
 */
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif

struct inet_hashinfo tcp_hashinfo;

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used
	   as the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
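/*
 * Illustrative worked example of the sequence offset above (editor's
 * sketch, derived from the code): if the old incarnation last sent
 * tw_snd_nxt == 1000, the reused connection starts at
 * write_seq == 1000 + 65535 + 2 == 66537, i.e. beyond any 64KB window
 * the peer could still hold open for the old connection, so stray old
 * segments cannot alias into the new sequence space.
 */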
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->inet_sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->inet_saddr)
		inet->inet_saddr = rt->rt_src;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it
		 * when trying a new connection.
		 */
		if (peer) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				tp->rx_opt.ts_recent = peer->tcp_ts;
			}
		}
	}

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->inet_sport, inet->inet_dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
/*
 * This routine does path MTU discovery as defined in RFC 1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go
	 * through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry if PMTU discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when PMTU discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case that this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path MTU
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
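/*
 * Illustrative userspace sketch (editor's note, not part of this file's
 * code): the per-socket pmtudisc mode tested above is what applications
 * select with the IP_MTU_DISCOVER socket option, e.g.
 *
 *	int val = IP_PMTUDISC_DO;	-- always set DF, do PMTUD
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *
 * A socket left at IP_PMTUDISC_DONT never has its cached MSS lowered
 * by the tcp_sync_mss() call above.
 */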
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC 1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* Check if icmp_skb allows a revert of the backoff
		 * (see draft-zimmermann-tcp-lcd). */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
					 icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else if (sock_owned_by_user(sk)) {
			/* RTO revert clocked out retransmission,
			 * but socket is locked. Will defer. */
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  HZ/20, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now. */
			tcp_retransmit_timer(sk);
		}
		break;
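	/*
	 * Illustrative worked example of the backoff revert above
	 * (editor's sketch): with a base RTO of 200 ms and
	 * icsk_backoff == 3, the backed-off RTO is 200 << 3 == 1600 ms.
	 * Reverting one step recomputes it as __tcp_set_rto(tp) << 2,
	 * i.e. 800 ms; if 500 ms have already elapsed since the head skb
	 * was stamped, the timer is re-armed for the remaining 300 ms.
	 */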
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, for example, if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;
			sk->sk_error_report(sk);
			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with the RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on the parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	net = dev_net(skb_dst(skb)->dev);
	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is admittedly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.bound_dev_if = oif;

	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			     struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return tcp_v4_send_synack(sk, NULL, req, rvp);
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

static void syn_flood_warning(const struct sk_buff *skb)
{
	const char *msg;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies)
		msg = "Sending cookies";
	else
#endif
		msg = "Dropping request";

	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
		ntohs(tcp_hdr(skb)->dest), msg);
}
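/*
 * Editor's note (illustrative): whether cookies are actually sent is a
 * runtime decision via the net.ipv4.tcp_syncookies sysctl, e.g.
 * "sysctl -w net.ipv4.tcp_syncookies=1", provided the kernel was built
 * with CONFIG_SYN_COOKIES.
 */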
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
					      struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC 2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */
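/*
 * Illustrative userspace sketch (editor's note, not part of this file's
 * code) of how a key ends up in this mapping via the TCP_MD5SIG socket
 * option parsed below:
 *
 *	struct tcp_md5sig md5;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	memset(&md5, 0, sizeof(md5));
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);  -- peer address
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that peer (see
 * tcp_v4_parse_md5_keys() below).
 */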
/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}
/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		}
		if (tcp_alloc_md5sig_pool(sk) == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
				 newkey, newkeylen);
}
int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
			} else if (tp->md5sig_info->entries4 != i) {
				/* Need to do some manipulation */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			tcp_free_md5sig_pool();
			return 0;
		}
	}
	return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);
static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the set of key keys,
	 * the crypto element, and then decrement our
	 * hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p;

		p = kzalloc(sizeof(*p), sk->sk_allocation);
		if (!p)
			return -ENOMEM;

		tp->md5sig_info = p;
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * The TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
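/*
 * Editor's sketch of the tcp4_pseudohdr block hashed above, which has
 * the classic RFC 793 checksum pseudo-header layout (96 bits):
 *
 *	saddr    __be32   source IP address
 *	daddr    __be32   destination IP address
 *	pad      __u8     always zero
 *	protocol __u8     IPPROTO_TCP (6)
 *	len      __be16   TCP segment length (nbytes above)
 *
 * Per RFC 2385 the MD5 digest then continues over the TCP header with
 * a zeroed checksum field, the segment data, and finally the key.
 */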
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			struct sock *sk, struct request_sock *req,
			struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives,
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return 1;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
			       &iph->saddr, ntohs(th->source),
			       &iph->daddr, ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast addresses. */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations: they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		if (net_ratelimit())
			syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop this request. It is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
		want_cookie = 0;	/* not our kind of cookie */
#endif
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive. It means that we continue to
			 * communicate with destinations already
			 * remembered at the moment of the synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, dst, req,
			       (struct request_values *)&tmp_ext) ||
	    want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
/*
 * The three-way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr   = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (newinet->opt)
		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
					  newkey, key->keylen);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	__inet_hash_nolisten(newsk, NULL);
	__inet_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible if:
	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		sock_rps_save_rxhash(sk, skb->rxhash);
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb->rxhash);

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;
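	/*
	 * Illustrative example of the end_seq arithmetic above (editor's
	 * note): a segment with seq == 4000 carrying 100 payload bytes and
	 * the FIN flag set yields end_seq == 4000 + 0 (syn) + 1 (fin) +
	 * 100 == 4101; SYN and FIN each consume one unit of sequence space
	 * in addition to the payload.
	 */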
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
/* VJ's idea. Save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for
 * duplicate segment detection in subsequent connections, before they
 * enter synchronized state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->inet_daddr) {
		peer = inet_getpeer(inet->inet_daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.remember_stamp	   = tcp_v4_remember_stamp,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	/* TCP Cookie Transactions */
	if (sysctl_tcp_cookie_size > 0) {
		/* Default: cookies without s_data_payload. */
		tp->cookie_values =
			kzalloc(sizeof(*tp->cookie_values),
				sk->sk_allocation);
		if (tp->cookie_values != NULL)
			kref_init(&tp->cookie_values->kref);
	}
	/* Presumed zeroed, in order of appearance:
	 *	cookie_in_always, cookie_out_never,
	 *	s_data_constant, s_data_in, s_data_out
	 */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	percpu_counter_inc(&tcp_sockets_allocated);
	local_bh_enable();

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Clean up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	/* TCP Cookie Transactions */
	if (tp->cookie_values != NULL) {
		kref_put(&tp->cookie_values->kref,
			 tcp_cookie_values_release);
		tp->cookie_values = NULL;
	}

	percpu_counter_dec(&tcp_sockets_allocated);
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{
	return hlist_nulls_empty(head) ? NULL :
		list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return !is_a_nulls(tw->tw_node.next) ?
		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

/*
 * Get the next listener socket following cur.  If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline int empty_bucket(struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
2094 * If st->bucket is zero, the very first socket in the hash is returned.
2096 static void *established_get_first(struct seq_file *seq)
2098 struct tcp_iter_state *st = seq->private;
2099 struct net *net = seq_file_net(seq);
2103 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2105 struct hlist_nulls_node *node;
2106 struct inet_timewait_sock *tw;
2107 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2109 /* Lockless fast path for the common case of empty buckets */
2110 if (empty_bucket(st))
2114 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2115 if (sk->sk_family != st->family ||
2116 !net_eq(sock_net(sk), net)) {
2122 st->state = TCP_SEQ_STATE_TIME_WAIT;
2123 inet_twsk_for_each(tw, node,
2124 &tcp_hashinfo.ehash[st->bucket].twchain) {
2125 if (tw->tw_family != st->family ||
2126 !net_eq(twsk_net(tw), net)) {
2132 spin_unlock_bh(lock);
2133 st->state = TCP_SEQ_STATE_ESTABLISHED;
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* Look for the next non-empty bucket */
		st->offset = 0;
		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
				empty_bucket(st))
			;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			return NULL;

		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
		sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			  sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family	= afinfo->family;
	s->last_pos	= 0;
	return 0;
}

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_fops.open		= tcp_seq_open;
	afinfo->seq_fops.read		= seq_read;
	afinfo->seq_fops.llseek		= seq_lseek;
	afinfo->seq_fops.release	= seq_release_net;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     &afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket, we might find a
		 * transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
		len);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
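/*
 * Editor's abridged example of one resulting /proc/net/tcp entry
 * (illustrative values only): a listener on port 22 could start with
 *
 *	0: 00000000:0016 00000000:0000 0A ...
 *
 * where addresses and ports are printed in hexadecimal, 0016 is
 * port 22, and the state byte 0A is TCP_LISTEN.
 */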
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct iphdr *iph = skb_gro_network_header(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}

		/* fall through */
	case CHECKSUM_NONE:
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	return tcp_gro_receive(head, skb);
}
EXPORT_SYMBOL(tcp4_gro_receive);

int tcp4_gro_complete(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
				  iph->saddr, iph->daddr, 0);
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}
EXPORT_SYMBOL(tcp4_gro_complete);
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
static int __net_init tcp_sk_init(struct net *net)
{
	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}

static void __net_exit tcp_sk_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);