/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 */
#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

/* This limits the percentage of the congestion window which we
 * will allow a single TSO frame to consume.  Building TSO frames
 * which are too large can cause TCP streams to be bursty.
 */
int sysctl_tcp_tso_win_divisor = 8;
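
/* Note on the helper below: sk_send_head points at the first not-yet-sent
 * skb on the write queue.  Advancing it past a just-transmitted skb, and
 * clearing it when it wraps to the queue's sentinel head, is how the output
 * engine remembers where the next transmission starts.
 */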
static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
				    struct sk_buff *skb)
{
	sk->sk_send_head = skb->next;
	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
		sk->sk_send_head = NULL;
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
	tcp_packets_out_inc(sk, tp, skb);
}
/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
{
	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tp->snd_una+tp->snd_wnd;
}
/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
		mss = dst_metric(dst, RTAX_ADVMSS);
		tp->advmss = mss;
	}

	return (__u16)mss;
}
/* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
 * This is the first part of the cwnd validation mechanism.
 */
static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
{
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
	u32 cwnd = tp->snd_cwnd;

	if (tcp_is_vegas(tp))
		tcp_vegas_enable(tp);

	tp->snd_ssthresh = tcp_current_ssthresh(tp);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}
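
/* Worked example (illustrative): after an idle period a little longer than
 * three RTOs with snd_cwnd 32 and restart_cwnd 4, the loop above halves
 * cwnd three times (32 -> 16 -> 8 -> 4), so transmission resumes with a
 * restart window of 4 segments rather than bursting a full old cwnd.
 */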
static inline void tcp_event_data_sent(struct tcp_sock *tp,
				       struct sk_buff *skb, struct sock *sk)
{
	u32 now = tcp_time_stamp;

	if (!tcp_get_pcount(&tp->packets_out) &&
	    (s32)(now - tp->lsndtime) > tp->rto)
		tcp_cwnd_restart(tp, __sk_dst_get(sk));

	tp->lsndtime = now;

	/* If it is a reply for ato after last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
		tp->ack.pingpong = 1;
}
static __inline__ void tcp_event_ack_sent(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_dec_quickack_mode(tp);
	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}
/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (65535 << 14);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = (space / mss) * mss;

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. We try to be nice.
	 * If we are not window scaling, then this truncates
	 * our initial window offering to 32k. There should also
	 * be a sysctl option to stop being nice.
	 */
	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

	/* Set initial window to value enough for senders,
	 * following RFC2414. Senders, not following this RFC,
	 * will be satisfied with 2.
	 */
	if (mss > (1<<*rcv_wscale)) {
		int init_cwnd = 4;
		if (mss > 1460*3)
			init_cwnd = 2;
		else if (mss > 1460)
			init_cwnd = 3;
		if (*rcv_wnd > init_cwnd*mss)
			*rcv_wnd = init_cwnd*mss;
	}

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
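
/* Worked example (illustrative): if the larger of sysctl_tcp_rmem[2] and
 * sysctl_rmem_max is 128K, the loop above halves it twice before it fits
 * in 16 bits, leaving rcv_wscale = 2.  Every window we later advertise is
 * then sent as th->window = rcv_wnd >> 2 and reconstructed by the peer as
 * th->window << 2, i.e. in units of 4 bytes.
 */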
/* Chose a new window to advertise, update state in tcp_sock for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static __inline__ u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if(new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = cur_win;
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rcv_wscale)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0)
		tp->pred_flags = 0;

	return new_win;
}
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if (skb != NULL) {
		struct inet_sock *inet = inet_sk(sk);
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;
		int err;

		BUG_ON(!tcp_skb_pcount(skb));

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if(sysctl_tcp_timestamps) {
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if(sysctl_tcp_window_scaling) {
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if(sysctl_tcp_sack) {
				sysctl_flags |= SYSCTL_FLAG_SACK;
				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
			}
		} else if (tp->eff_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
		}

		/*
		 * If the connection is idle and we are restarting,
		 * then we don't want to do any Vegas calculations
		 * until we get fresh RTT samples.  So when we
		 * restart, we reset our Vegas state to a clean
		 * slate. After we get acks for this flight of
		 * packets, _then_ we can make Vegas calculations
		 * again.
		 */
		if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
			tcp_vegas_enable(tp);

		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		th->source		= inet->sport;
		th->dest		= inet->dport;
		th->seq			= htonl(tcb->seq);
		th->ack_seq		= htonl(tp->rcv_nxt);
		/* Write the data offset (header length in 32-bit words) and
		 * the flag bits as one 16-bit word at byte offset 12.
		 */
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window	= htons(tp->rcv_wnd);
		} else {
			th->window	= htons(tcp_select_window(sk));
		}
		th->check		= 0;
		th->urg_ptr		= 0;

		if (tp->urg_mode &&
		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
			th->urg			= 1;
		}

		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_syn_build_options((__u32 *)(th + 1),
					      tcp_advertise_mss(sk),
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      tcb->when,
					      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, tcb->when);

			TCP_ECN_send(sk, tp, skb, tcp_header_size);
		}
		tp->af_specific->send_check(sk, th, skb->len, skb);

		if (tcb->flags & TCPCB_FLAG_ACK)
			tcp_event_ack_sent(sk);

		if (skb->len != tcp_header_size)
			tcp_event_data_sent(tp, skb, sk);

		TCP_INC_STATS(TCP_MIB_OUTSEGS);

		err = tp->af_specific->queue_xmit(skb, 0);
		if (err <= 0)
			return err;

		tcp_enter_cwr(tp);

		/* NET_XMIT_CN is special. It does not guarantee,
		 * that this packet is lost. It tells that device
		 * is about to start to drop packets or already
		 * drops some packets of the same priority and
		 * invokes us to send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
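
/* For reference: a SYN carrying all three option groups above takes
 * 20 (base header) + 4 (MSS) + 12 (timestamps, whose NOP padding also
 * carries SACK-permitted) + 4 (window scale) = 40 bytes, well within
 * the 60-byte maximum TCP header.
 */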
/* This routine just queues the buffer.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->sk_write_queue, skb);
	sk_charge_skb(sk, skb);

	/* Queue it, remembering where we must start sending. */
	if (sk->sk_send_head == NULL)
		sk->sk_send_head = skb;
}
/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned cur_mss)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = sk->sk_send_head;

	if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
			sk->sk_send_head = NULL;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tcp_packets_out_inc(sk, tp, skb);
			return;
		}
	}
}
void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
{
	if (skb->len <= mss_std) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		skb_shinfo(skb)->tso_segs = 1;
		skb_shinfo(skb)->tso_size = 0;
	} else {
		unsigned int factor;

		factor = skb->len + (mss_std - 1);
		factor /= mss_std;
		skb_shinfo(skb)->tso_segs = factor;
		skb_shinfo(skb)->tso_size = mss_std;
	}
}
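
/* The factor computed above is the ceiling of len/mss_std: for example
 * (illustrative) a 4000-byte skb with mss_std 1460 yields
 * (4000 + 1459) / 1460 = 3 segments, of which the last is short.
 */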
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int nsize;
	u16 flags;

	nsize = skb_headlen(skb) - len;
	if (nsize < 0)
		nsize = 0;

	if (skb_cloned(skb) &&
	    skb_is_nonlinear(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */
	sk_charge_skb(sk, buff);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked =
		(TCP_SKB_CB(skb)->sacked &
		 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
		/* Copy and checksum data tail into the new buffer. */
		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
						       nsize, 0);

		skb_trim(skb, len);

		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	} else {
		skb->ip_summed = CHECKSUM_HW;
		skb_split(skb, buff, len);
	}

	buff->ip_summed = skb->ip_summed;

	/* Looks stupid, but our code really uses when of
	 * skbs, which it never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
		tcp_dec_pcount(&tp->lost_out, skb);
		tcp_dec_pcount(&tp->left_out, skb);
	}

	/* Fix up tso_factor for both original and new SKB. */
	tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
	tcp_set_skb_tso_segs(buff, tp->mss_cache_std);

	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
		tcp_inc_pcount(&tp->lost_out, skb);
		tcp_inc_pcount(&tp->left_out, skb);
	}

	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
		tcp_inc_pcount(&tp->lost_out, buff);
		tcp_inc_pcount(&tp->left_out, buff);
	}

	/* Link BUFF into the send queue. */
	__skb_append(skb, buff);

	return 0;
}
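
/* Worked example (illustrative): splitting an skb covering sequence
 * range 1000..3000 at len = 1000 leaves the original with 1000..2000 and
 * links a new buff covering 2000..3000 right after it on the queue;
 * PSH/FIN, if set, migrate to buff.
 */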
/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually). The difference is that the pulled data is not copied, but
 * immediately discarded.
 */
static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
{
	int i, k, eat;

	eat = len;
	k = 0;
	for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
		if (skb_shinfo(skb)->frags[i].size <= eat) {
			put_page(skb_shinfo(skb)->frags[i].page);
			eat -= skb_shinfo(skb)->frags[i].size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_shinfo(skb)->frags[k].size -= eat;
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	skb->tail = skb->data;
	skb->data_len -= len;
	skb->len = skb->data_len;
	return skb->tail;
}
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	if (skb_cloned(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	if (len <= skb_headlen(skb)) {
		__skb_pull(skb, len);
	} else {
		if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
			return -ENOMEM;
	}

	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_HW;

	skb->truesize	     -= len;
	sk->sk_queue_shrunk   = 1;
	sk->sk_wmem_queued   -= len;
	sk->sk_forward_alloc += len;

	/* Any change of skb->len requires recalculation of tso
	 * factor and mss.
	 */
	if (tcp_skb_pcount(skb) > 1)
		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));

	return 0;
}
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT account
   for TCP options, but includes only bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */

unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss_now;

	if (dst && dst->ops->get_mss)
		pmtu = dst->ops->get_mss(dst, pmtu);

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->mss_clamp)
		mss_now = tp->mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= tp->ext_header_len + tp->ext2_header_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	/* Bound mss with half of window */
	if (tp->max_window && mss_now > (tp->max_window>>1))
		mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);

	/* And store cached results */
	tp->pmtu_cookie = pmtu;
	tp->mss_cache = tp->mss_cache_std = mss_now;

	return mss_now;
}
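
/* Worked example (IPv4, no extension headers, illustrative): pmtu 1500
 * gives 1500 - 20 (IP) - 20 (TCP) = 1460; with timestamps enabled
 * tcp_header_len is 32, so the 12 option bytes leave mss_cache at 1448.
 */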
/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 *
 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
 * cannot be large. However, taking into account rare use of URG, this
 * is not a big flaw.
 */

unsigned int tcp_current_mss(struct sock *sk, int large)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	unsigned int do_large, mss_now;

	mss_now = tp->mss_cache_std;
	if (dst) {
		u32 mtu = dst_pmtu(dst);
		if (mtu != tp->pmtu_cookie ||
		    tp->ext2_header_len != dst->header_len)
			mss_now = tcp_sync_mss(sk, mtu);
	}

	do_large = (large &&
		    (sk->sk_route_caps & NETIF_F_TSO) &&
		    !tp->urg_mode);

	if (do_large) {
		unsigned int large_mss, factor, limit;

		large_mss = 65535 - tp->af_specific->net_header_len -
			tp->ext_header_len - tp->ext2_header_len -
			tp->tcp_header_len;

		if (tp->max_window && large_mss > (tp->max_window>>1))
			large_mss = max((tp->max_window>>1),
					68U - tp->tcp_header_len);

		factor = large_mss / mss_now;

		/* Always keep large mss multiple of real mss, but
		 * do not exceed 1/tso_win_divisor of the congestion window
		 * so we can keep the ACK clock ticking and minimize
		 * bursting.
		 */
		limit = tp->snd_cwnd;
		if (sysctl_tcp_tso_win_divisor)
			limit /= sysctl_tcp_tso_win_divisor;
		limit = max(1U, limit);
		if (factor > limit)
			factor = limit;

		tp->mss_cache = mss_now * factor;

		mss_now = tp->mss_cache;
	}

	if (tp->eff_sacks)
		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
			    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
	return mss_now;
}
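
/* Worked example (illustrative): with mss_now 1448 and a 20-byte network
 * header, large_mss is 65535 - 20 - 32 = 65483, giving a raw factor of 45
 * segments per TSO frame.  With snd_cwnd 16 and tso_win_divisor 8 the
 * limit is 2, so mss_cache becomes 2 * 1448 = 2896.
 */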
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk, int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int mss_now;

	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and all
	 * will be happy.
	 */
	if (sk->sk_state != TCP_CLOSE) {
		struct sk_buff *skb;
		int sent_pkts = 0;

		/* Account for SACKS, we may need to fragment due to this.
		 * It is just like the real MSS changing on us midstream.
		 * We also handle things correctly when the user adds some
		 * IP options mid-stream.  Silly to do, but cover it.
		 */
		mss_now = tcp_current_mss(sk, 1);

		/* Only the last segment on the queue is subject to the
		 * caller's nagle mode; everything before it is pushed.
		 */
		while ((skb = sk->sk_send_head) &&
		       tcp_snd_test(tp, skb, mss_now,
				    tcp_skb_is_last(sk, skb) ? nonagle :
							       TCP_NAGLE_PUSH)) {
			if (skb->len > mss_now) {
				if (tcp_fragment(sk, skb, mss_now))
					break;
			}

			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
				break;

			/* Advance the send_head.  This one is sent out.
			 * This call will increment packets_out.
			 */
			update_send_head(sk, tp, skb);

			tcp_minshall_update(tp, mss_now, skb);
			sent_pkts = 1;
		}

		if (sent_pkts) {
			tcp_cwnd_validate(sk, tp);
			return 0;
		}

		return !tcp_get_pcount(&tp->packets_out) && sk->sk_send_head;
	}
	return 0;
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	int mss = tp->ack.rcv_mss;
	int free_space = tcp_space(sk);
	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
	int window;

	if (mss > full_space)
		mss = full_space;

	if (free_space < full_space/2) {
		tp->ack.quick = 0;

		if (tcp_memory_pressure)
			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);

		if (free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 */
	window = tp->rcv_wnd;
	if (tp->rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Important case: prevent zero window announcement if
		 * 1<<rcv_wscale > mss.
		 */
		if (((window >> tp->rcv_wscale) << tp->rcv_wscale) != window)
			window = (((window >> tp->rcv_wscale) + 1)
				  << tp->rcv_wscale);
	} else {
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = (free_space/mss)*mss;
	}

	return window;
}
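
/* Worked example (illustrative): without window scaling, free_space 10000
 * and mss 1460 round the offer down to (10000/1460)*1460 = 8760, and that
 * value is kept until free space drifts more than one mss away from it,
 * which is what keeps th->window fixed for header prediction.
 */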
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Also punt if next skb has been SACK'd. */
		if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Next skb is out of window. */
		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		BUG_ON(tcp_skb_pcount(skb) != 1 ||
		       tcp_skb_pcount(next_skb) != 1);

		/* Ok.  We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);

		if (next_skb->ip_summed == CHECKSUM_HW)
			skb->ip_summed = CHECKSUM_HW;

		if (skb->ip_summed != CHECKSUM_HW)
			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
			tcp_dec_pcount(&tp->retrans_out, next_skb);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
			tcp_dec_pcount(&tp->lost_out, next_skb);
			tcp_dec_pcount(&tp->left_out, next_skb);
		}
		/* Reno case is special. Sigh... */
		if (!tp->sack_ok && tcp_get_pcount(&tp->sacked_out)) {
			tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
			tcp_dec_pcount(&tp->left_out, next_skb);
		}

		/* Not quite right: it can be > snd.fack, but
		 * it is better to underestimate fackets.
		 */
		tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
		tcp_packets_out_dec(tp, next_skb);
		sk_stream_free_skb(sk, next_skb);
	}
}
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk, 0);
	int lost = 0;

	sk_stream_for_retrans_queue(skb, sk) {
		if (skb->len > mss &&
		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tcp_dec_pcount(&tp->retrans_out, skb);
			}
			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tcp_inc_pcount(&tp->lost_out, skb);
				lost = 1;
			}
		}
	}

	if (!lost)
		return;

	tcp_sync_left_out(tp);

	/* Don't muck with the congestion window here.
	 * Reason is that we do not increase amount of _data_
	 * in network, but units changed and effective
	 * cwnd/ssthresh really reduced now.
	 */
	if (tp->ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(tp);
		tp->prior_ssthresh = 0;
		tp->undo_marker = 0;
		tcp_set_ca_state(tp, TCP_CA_Loss);
	}
	tcp_xmit_retransmit_queue(sk);
}
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int cur_mss = tcp_current_mss(sk, 0);
	int err;

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->sk_wmem_alloc) >
	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
		return -EAGAIN;

	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			BUG();

		if (sk->sk_route_caps & NETIF_F_TSO) {
			sk->sk_route_caps &= ~NETIF_F_TSO;
			sk->sk_no_largesend = 1;
			tp->mss_cache = tp->mss_cache_std;
		}

		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	if (skb->len > cur_mss) {
		int old_factor = tcp_skb_pcount(skb);
		int new_factor;

		if (tcp_fragment(sk, skb, cur_mss))
			return -ENOMEM; /* We'll try again later. */

		/* New SKB created, account for it. */
		new_factor = tcp_skb_pcount(skb);
		tcp_dec_pcount_explicit(&tp->packets_out,
					old_factor - new_factor);
		tcp_inc_pcount(&tp->packets_out, skb->next);
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	   (skb->len < (cur_mss >> 1)) &&
	   (skb->next != sk->sk_send_head) &&
	   (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
	   (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
	   (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if(tp->af_specific->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if(skb->len > 0 &&
	   (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	   tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		if (!pskb_trim(skb, 0)) {
			TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
			skb_shinfo(skb)->tso_segs = 1;
			skb_shinfo(skb)->tso_size = 0;
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
		}
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
				    pskb_copy(skb, GFP_ATOMIC):
				    skb_clone(skb, GFP_ATOMIC)));

	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(TCP_MIB_RETRANSSEGS);

		tp->total_retrans++;

#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
			if (net_ratelimit())
				printk(KERN_DEBUG "retrans_out leaked.\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tcp_inc_pcount(&tp->retrans_out, skb);

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		tp->undo_retrans++;

		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int packet_cnt = tcp_get_pcount(&tp->lost_out);

	/* First pass: retransmit lost packets. */
	if (packet_cnt) {
		sk_stream_for_retrans_queue(skb, sk) {
			__u8 sacked = TCP_SKB_CB(skb)->sacked;

			/* Assume this retransmit will generate
			 * only one packet for congestion window
			 * calculation purposes.  This works because
			 * tcp_retransmit_skb() will chop up the
			 * packet to be MSS sized and all the
			 * packet counting works out.
			 */
			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				return;

			if (sacked&TCPCB_LOST) {
				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
					if (tcp_retransmit_skb(sk, skb))
						return;
					if (tp->ca_state != TCP_CA_Loss)
						NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
					else
						NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);

					if (skb ==
					    skb_peek(&sk->sk_write_queue))
						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
				}

				packet_cnt -= tcp_skb_pcount(skb);
				if (packet_cnt <= 0)
					break;
			}
		}
	}

	/* OK, demanded retransmission is finished. */

	/* Forward retransmissions are possible only during Recovery. */
	if (tp->ca_state != TCP_CA_Recovery)
		return;

	/* No forward retransmissions in Reno are possible. */
	if (!tp->sack_ok)
		return;

	/* Yeah, we have to make difficult choice between forward transmission
	 * and retransmission... Both ways have their merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send.
	 */

	if (tcp_may_send_now(sk, tp))
		return;

	packet_cnt = 0;

	sk_stream_for_retrans_queue(skb, sk) {
		/* Similar to the retransmit loop above we
		 * can pretend that the retransmitted SKB
		 * we send out here will be composed of one
		 * real MSS sized packet because tcp_retransmit_skb()
		 * will fragment it if necessary.
		 */
		if (++packet_cnt > tcp_get_pcount(&tp->fackets_out))
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
			continue;

		/* Ok, retransmit it. */
		if (tcp_retransmit_skb(sk, skb))
			break;

		if (skb == skb_peek(&sk->sk_write_queue))
			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

		NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
	}
}
/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
	int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk, 1);

	if (sk->sk_send_head != NULL) {
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;
	} else {
		/* Socket is locked, keep trying until memory is available. */
		for (;;) {
			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
			if (skb)
				break;
			yield();
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_TCP_HEADER);
		skb->csum = 0;
		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
		TCP_SKB_CB(skb)->sacked = 0;
		skb_shinfo(skb)->tso_segs = 1;
		skb_shinfo(skb)->tso_size = 0;

		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		TCP_SKB_CB(skb)->seq = tp->write_seq;
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
	TCP_SKB_CB(skb)->sacked = 0;
	skb_shinfo(skb)->tso_segs = 1;
	skb_shinfo(skb)->tso_size = 0;

	/* Send it off. */
	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, skb))
		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff* skb;

	skb = skb_peek(&sk->sk_write_queue);
	if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
		if (skb_cloned(skb)) {
			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
			if (nskb == NULL)
				return -ENOMEM;
			__skb_unlink(skb, &sk->sk_write_queue);
			__skb_queue_head(&sk->sk_write_queue, nskb);
			sk_stream_free_skb(sk, skb);
			sk_charge_skb(sk, nskb);
			skb = nskb;
		}

		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
		TCP_ECN_send_synack(tcp_sk(sk), skb);
	}
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				 struct open_request *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb->dst = dst_clone(dst);

	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	if (dst->dev->features&NETIF_F_TSO)
		req->ecn_ok = 0;
	TCP_ECN_make_synack(req, th);
	th->source = inet_sk(sk)->sport;
	th->dest = req->rmt_port;
	TCP_SKB_CB(skb)->seq = req->snt_isn;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	TCP_SKB_CB(skb)->sacked = 0;
	skb_shinfo(skb)->tso_segs = 1;
	skb_shinfo(skb)->tso_size = 0;
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(req->rcv_isn + 1);
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			req->wscale_ok,
			&rcv_wscale);
		req->rcv_wscale = rcv_wscale;
	}

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(req->rcv_wnd);

	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
			      TCP_SKB_CB(skb)->when,
			      req->ts_recent);

	skb->csum = 0;
	th->doff = (tcp_header_size >> 2);
	TCP_INC_STATS(TCP_MIB_OUTSEGS);
	return skb;
}
/*
 * Do all connect socket setups that can be done AF independent.
 */
static inline void tcp_connect_init(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;
	tp->max_window = 0;
	tcp_sync_mss(sk, dst_pmtu(dst));

	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(sk);

	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &tp->rcv_wscale);

	tp->rcv_ssthresh = tp->rcv_wnd;

	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tcp_init_wl(tp, tp->write_seq, 0);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->rcv_nxt = 0;
	tp->rcv_wup = 0;
	tp->copied_seq = 0;

	tp->rto = TCP_TIMEOUT_INIT;
	tp->retransmits = 0;
	tcp_clear_retrans(tp);
}
/*
 * Build a SYN and send it off.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;

	tcp_connect_init(sk);

	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
	if (unlikely(buff == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);

	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_ECN_send_syn(sk, tp, buff);
	TCP_SKB_CB(buff)->sacked = 0;
	skb_shinfo(buff)->tso_segs = 1;
	skb_shinfo(buff)->tso_size = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;

	/* Send it off. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	__skb_queue_tail(&sk->sk_write_queue, buff);
	sk_charge_skb(sk, buff);
	tcp_inc_pcount(&tp->packets_out, buff);
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	return 0;
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int ato = tp->ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		int max_ato = HZ/2;

		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use tp->rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt) {
			int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't an older one earlier. */
	if (tp->ack.pending&TCP_ACK_TIMER) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 */
		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
			tcp_send_ack(sk);
			return;
		}

		if (!time_before(timeout, tp->ack.timeout))
			timeout = tp->ack.timeout;
	}
	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
	tp->ack.timeout = timeout;
	sk_reset_timer(sk, &tp->delack_timer, timeout);
}
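
/* Example (illustrative): on an interactive (pingpong) connection with
 * srtt>>3 = 40ms, max_ato shrinks from TCP_DELACK_MAX to 40ms, so the
 * delayed ack never waits longer than roughly one measured round trip.
 */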
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if (sk->sk_state != TCP_CLOSE) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct sk_buff *buff;

		/* We are not putting this on the write queue, so
		 * tcp_transmit_skb() will set the ownership to this
		 * sock.
		 */
		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
		if (buff == NULL) {
			tcp_schedule_ack(tp);
			tp->ack.ato = TCP_ATO_MIN;
			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
			return;
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(buff, MAX_TCP_HEADER);
		buff->csum = 0;
		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
		TCP_SKB_CB(buff)->sacked = 0;
		skb_shinfo(buff)->tso_segs = 1;
		skb_shinfo(buff)->tso_size = 0;

		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
		TCP_SKB_CB(buff)->when = tcp_time_stamp;
		tcp_transmit_skb(sk, buff);
	}
}
/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we send while in urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (skb == NULL)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
	TCP_SKB_CB(skb)->sacked = urgent;
	skb_shinfo(skb)->tso_segs = 1;
	skb_shinfo(skb)->tso_size = 0;

	/* Use a previous sequence.  This should cause the other
	 * end to send an ack.  Don't queue or clone SKB, just
	 * send it.
	 */
	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb);
}
int tcp_write_wakeup(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct sk_buff *skb;

		if ((skb = sk->sk_send_head) != NULL &&
		    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
			int err;
			unsigned int mss = tcp_current_mss(sk, 0);
			unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;

			if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
				tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

			/* We are probing the opening of a window
			 * but the window size is != 0
			 * must have been a result of SWS avoidance ( sender )
			 */
			if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
			    skb->len > mss) {
				seg_size = min(seg_size, mss);
				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
				if (tcp_fragment(sk, skb, seg_size))
					return -1;
				/* SWS override triggered forced fragmentation.
				 * Disable TSO, the connection is too sick.
				 */
				if (sk->sk_route_caps & NETIF_F_TSO) {
					sk->sk_no_largesend = 1;
					sk->sk_route_caps &= ~NETIF_F_TSO;
					tp->mss_cache = tp->mss_cache_std;
				}
			} else if (!tcp_skb_pcount(skb))
				tcp_set_skb_tso_segs(skb, tp->mss_cache_std);

			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!err) {
				update_send_head(sk, tp, skb);
			}
			return err;
		} else {
			if (tp->urg_mode &&
			    between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
				tcp_xmit_probe_skb(sk, TCPCB_URG);
			return tcp_xmit_probe_skb(sk, 0);
		}
	}
	return -1;
}
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err;

	err = tcp_write_wakeup(sk);

	if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) {
		/* Cancel probe timer, if it is not required. */
		tp->probes_out = 0;
		tp->backoff = 0;
		return;
	}

	if (err <= 0) {
		if (tp->backoff < sysctl_tcp_retries2)
			tp->backoff++;
		tp->probes_out++;
		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
				      min(tp->rto << tp->backoff, TCP_RTO_MAX));
	} else {
		/* If packet was not sent due to local congestion,
		 * do not backoff and do not remember probes_out.
		 * Let local senders fight for local resources.
		 *
		 * Use accumulated backoff yet.
		 */
		if (!tp->probes_out)
			tp->probes_out=1;
		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
				      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
	}
}
EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss);