// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data for which there
 *					is not enough room. Also make this condition
 *					a fatal error if it might still happen.
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make
 *					connections with MSS<min(MTU,ann. MSS)
 *					work without delayed acks.
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 *		J Hadi Salim:		ECN support
 *	 	Andrei Gurtov,
 *		Pasi Sarolahti,
 *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
 *					engine. Lots of bugs are found.
 *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/static_key.h>
#include <net/busy_poll.h>

int sysctl_tcp_max_orphans __read_mostly = NR_FILE;

#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

#define REXMIT_NONE	0 /* no loss recovery to do */
#define REXMIT_LOST	1 /* retransmit packets marked lost */
#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */

#if IS_ENABLED(CONFIG_TLS_DEVICE)
static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);

void clean_acked_data_enable(struct inet_connection_sock *icsk,
			     void (*cad)(struct sock *sk, u32 ack_seq))
{
	icsk->icsk_clean_acked = cad;
	static_branch_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);

void clean_acked_data_disable(struct inet_connection_sock *icsk)
{
	static_branch_dec(&clean_acked_data_enabled);
	icsk->icsk_clean_acked = NULL;
}
EXPORT_SYMBOL_GPL(clean_acked_data_disable);
#endif

static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
			     unsigned int len)
{
	static bool __once __read_mostly;

	if (!__once) {
		struct net_device *dev;

		__once = true;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
		if (!dev || len >= dev->mtu)
			pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
				dev ? dev->name : "Unknown driver");
		rcu_read_unlock();
	}
}

/* Adapt the MSS value used to make delayed ack decision to the
 * real world.
 */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
	unsigned int len;

	icsk->icsk_ack.last_seg_size = 0;

	/* skb->len may jitter because of SACKs, even if peer
	 * sends good full-sized frames.
	 */
	len = skb_shinfo(skb)->gso_size ? : skb->len;
	if (len >= icsk->icsk_ack.rcv_mss) {
		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
					       tcp_sk(sk)->advmss);
		/* Account for possibly-removed options */
		if (unlikely(len > icsk->icsk_ack.rcv_mss +
				   MAX_TCP_OPTION_SPACE))
			tcp_gro_dev_warn(sk, skb, len);
	} else {
		/* Otherwise, we make a more careful check, taking into
		 * account that SACK blocks are variable in size.
		 *
		 * "len" is invariant segment length, including TCP header.
		 */
		len += skb->data - skb_transport_header(skb);
		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
		    /* If PSH is not set, packet should be
		     * full sized, provided peer TCP is not badly broken.
		     * This observation (if it is correct 8)) allows us
		     * to handle super-low mtu links fairly.
		     */
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
			/* Subtract also invariant (if peer is RFC compliant),
			 * tcp header plus fixed timestamp option length.
			 * Resulting "len" is MSS free of SACK jitter.
			 */
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
			if (len == lss) {
				icsk->icsk_ack.rcv_mss = len;
				return;
			}
		}
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
	}
}

static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

	if (quickacks == 0)
		quickacks = 2;
	quickacks = min(quickacks, max_quickacks);
	if (quickacks > icsk->icsk_ack.quick)
		icsk->icsk_ack.quick = quickacks;
}
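
/* Illustrative note, not from the original source: with example values
 * rcv_wnd = 65535 and rcv_mss = 1460, the budget computed above is
 * 65535 / (2 * 1460) ~= 22 quick ACKs, which is then clamped to
 * max_quickacks (itself bounded by TCP_MAX_QUICKACKS) before being
 * stored in icsk_ack.quick.
 */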

void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_incr_quickack(sk, max_quickacks);
	icsk->icsk_ack.pingpong = 0;
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}
EXPORT_SYMBOL(tcp_enter_quickack_mode);

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */

static bool tcp_in_quickack_mode(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);

	return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
		(icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
}

static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
	if (tp->ecn_flags & TCP_ECN_OK)
		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
{
	if (tcp_hdr(skb)->cwr) {
		tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;

		/* If the sender is telling us it has entered CWR, then its
		 * cwnd may be very low (even just 1 packet), so we should ACK
		 * immediately.
		 */
		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
	}
}

static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
	case INET_ECN_NOT_ECT:
		/* Funny extension: if ECT is not set on a segment,
		 * and we have already seen ECT on a previous segment,
		 * it is probably a retransmit.
		 */
		if (tp->ecn_flags & TCP_ECN_SEEN)
			tcp_enter_quickack_mode(sk, 2);
		break;
	case INET_ECN_CE:
		if (tcp_ca_needs_ecn(sk))
			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);

		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
			/* Better not delay acks, sender can have a very low cwnd */
			tcp_enter_quickack_mode(sk, 2);
			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		}
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	default:
		if (tcp_ca_needs_ecn(sk))
			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	}
}

static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
		__tcp_ecn_check_ce(sk, skb);
}

static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
		return true;
	return false;
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 */

static void tcp_sndbuf_expand(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	int sndmem, per_mss;
	u32 nr_segs;

	/* Worst case is non GSO/TSO : each frame consumes one skb
	 * and skb->head is kmalloced using power of two area of memory
	 */
	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
		  MAX_TCP_HEADER +
		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	per_mss = roundup_pow_of_two(per_mss) +
		  SKB_DATA_ALIGN(sizeof(struct sk_buff));

	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);

	/* Fast Recovery (RFC 5681 3.2) :
	 * Cubic needs 1.7 factor, rounded to 2 to include
	 * extra cushion (application might react slowly to EPOLLOUT)
	 */
	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
	sndmem *= nr_segs * per_mss;
	if (sk->sk_sndbuf < sndmem)
		sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
}
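
/* Illustrative note, not from the original source: with an example
 * mss_cache of ~1460 bytes, per_mss rounds up to roughly 4 KB once header
 * and skb overhead are added, so with nr_segs = 10 and the default factor
 * of 2 the sndmem target is on the order of 80-90 KB, later capped by
 * sysctl_tcp_wmem[2].
 */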

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is maximal advertised window. It can be less than
 * tcp_full_space(), in this case tcp_full_space() - window_clamp
 * is reserved for "application" buffer. The less window_clamp is
 * the smoother our behaviour from viewpoint of network, but the lower
 * throughput and the higher sensitivity of the connection to losses. 8)
 *
 * rcv_ssthresh is more strict window_clamp used at "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}

static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Check #1 */
	if (tp->rcv_ssthresh < tp->window_clamp &&
	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
	    !tcp_under_memory_pressure(sk)) {
		int incr;

		/* Check #2. Increase window, if skb with such overhead
		 * will fit to rcvbuf in future.
		 */
		if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
			incr = 2 * tp->advmss;
		else
			incr = __tcp_grow_window(sk, skb);

		if (incr) {
			incr = max_t(int, incr, 2 * skb->len);
			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
					       tp->window_clamp);
			inet_csk(sk)->icsk_ack.quick |= 1;
		}
	}
}

/* 3. Try to fixup all. It is made immediately after connection enters
 *    established state.
 */
void tcp_init_buffer_space(struct sock *sk)
{
	int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
		tcp_sndbuf_expand(sk);

	tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss);
	tcp_mstamp_refresh(tp);
	tp->rcvq_space.time = tp->tcp_mstamp;
	tp->rcvq_space.seq = tp->copied_seq;

	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

		if (tcp_app_win && maxwin > 4 * tp->advmss)
			tp->window_clamp = max(maxwin -
					       (maxwin >> tcp_app_win),
					       4 * tp->advmss);
	}

	/* Force reservation of one segment. */
	if (tcp_app_win &&
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
	tp->snd_cwnd_stamp = tcp_jiffies32;
}

/* 4. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct net *net = sock_net(sk);

	icsk->icsk_ack.quick = 0;

	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
	    !tcp_under_memory_pressure(sk) &&
	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				    net->ipv4.sysctl_tcp_rmem[2]);
	}
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We don't have any direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate.
 * Overestimations make us ACK less frequently than needed.
 * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd / 2);
	hint = min(hint, TCP_MSS_DEFAULT);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
EXPORT_SYMBOL(tcp_initialize_rcv_mss);
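
/* Illustrative note, not from the original source: for a new connection
 * advertising an MSS of 1460 with rcv_wnd = 65535, the hint above is the
 * smallest of advmss, mss_cache, half the receive window and
 * TCP_MSS_DEFAULT, then bounded below by TCP_MIN_MSS, so the initial
 * rcv_mss guess is the conservative TCP_MSS_DEFAULT rather than 1460.
 */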

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <http://public.lanl.gov/radiant/pubs.html#DRS>
 *
 * More detail on this code can be found at
 * <http://staff.psc.edu/jheffner/>,
 * though this reference is out of date.  A new paper
 * is pending.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt_us;
	long m = sample;

	if (new_sample != 0) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
		 * non-timestamp case, we do not smooth things out
		 * else with timestamps disabled convergence takes too
		 * long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else {
			m <<= 3;
			if (m < new_sample)
				new_sample = m;
		}
	} else {
		/* No previous measure. */
		new_sample = m << 3;
	}

	tp->rcv_rtt_est.rtt_us = new_sample;
}
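
/* Illustrative note, not from the original source: rcv_rtt_est.rtt_us is
 * kept left-shifted by 3.  With timestamps (win_dep == 0) the update above
 * is an EWMA, new = 7/8 * old + 1/8 * sample, so an old estimate of 40 ms
 * and a 32 ms sample give roughly 39 ms.  Without timestamps (win_dep == 1)
 * the estimate only moves toward smaller samples, acting as a running
 * minimum.
 */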

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	u32 delta_us;

	if (tp->rcv_rtt_est.time == 0)
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
	if (!delta_us)
		delta_us = 1;
	tcp_rcv_rtt_update(tp, delta_us, 1);

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tp->tcp_mstamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
					  const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
		return;
	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;

	if (TCP_SKB_CB(skb)->end_seq -
	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
		u32 delta_us;

		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
			if (!delta)
				delta = 1;
			delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
			tcp_rcv_rtt_update(tp, delta_us, 0);
		}
	}
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 copied;
	int time;

	trace_tcp_rcv_space_adjust(sk);

	tcp_mstamp_refresh(tp);
	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
	if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
		return;

	/* Number of bytes copied to user in last RTT */
	copied = tp->copied_seq - tp->rcvq_space.seq;
	if (copied <= tp->rcvq_space.space)
		goto new_measure;

	/* A bit of theory :
	 * copied = bytes received in previous RTT, our base window
	 * To cope with packet losses, we need a 2x factor
	 * To cope with slow start, and sender growing its cwin by 100 %
	 * every RTT, we need a 4x factor, because the ACK we are sending
	 * now is for the next RTT, not the current one :
	 * <prev RTT . ><current RTT .. ><next RTT .... >
	 */

	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvmem, rcvbuf;
		u64 rcvwin, grow;

		/* minimal window to cope with packet losses, assuming
		 * steady state. Add some cushion because of small variations.
		 */
		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
		/* Accommodate for sender rate increase (eg. slow start) */
		grow = rcvwin * (copied - tp->rcvq_space.space);
		do_div(grow, tp->rcvq_space.space);
		rcvwin += (grow << 1);
		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
			rcvmem += 128;
		do_div(rcvwin, tp->advmss);
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
		if (rcvbuf > sk->sk_rcvbuf) {
			sk->sk_rcvbuf = rcvbuf;

			/* Make the window clamp follow along.  */
			tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
		}
	}
	tp->rcvq_space.space = copied;
new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tp->tcp_mstamp;
}
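
/* Illustrative note, not from the original source: if ~1 MB was copied to
 * the application in the last RTT, rcvwin above starts at twice that plus
 * 16 * advmss of cushion, is grown further if the sender is accelerating,
 * and is then converted to a buffer size via the per-segment truesize
 * (rcvmem), capped by sysctl_tcp_rmem[2].
 */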

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACKs the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now;

	inet_csk_schedule_ack(sk);

	tcp_measure_rcv_mss(sk, skb);

	tcp_rcv_rtt_measure(tp);

	now = tcp_jiffies32;

	if (!icsk->icsk_ack.ato) {
		/* The _first_ data packet received, initialize
		 * delayed ACK engine.
		 */
		tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
	} else {
		int m = now - icsk->icsk_ack.lrcvtime;

		if (m <= TCP_ATO_MIN / 2) {
			/* The fastest case is the first. */
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
			/* Too long gap. Apparently sender failed to
			 * restart window, so that we send ACKs quickly.
			 */
			tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
			sk_mem_reclaim(sk);
		}
	}
	icsk->icsk_ack.lrcvtime = now;

	tcp_ecn_check_ce(sk, skb);

	if (skb->len >= 128)
		tcp_grow_window(sk, skb);
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
	struct tcp_sock *tp = tcp_sk(sk);
	long m = mrtt_us; /* RTT */
	u32 srtt = tp->srtt_us;

	/*	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 *
	 *	On a 1990 paper the rto value is changed to:
	 *	RTO = rtt + 4 * mdev
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO, when it should be decreased, increase
	 * too slowly, when it should be increased quickly, decrease too quickly
	 * etc. I guess in BSD RTO takes ONE value, so that it absolutely
	 * does not matter how to _calculate_ it. Seems, it was a trap
	 * that VJ failed to avoid. 8)
	 */
	if (srtt != 0) {
		m -= (srtt >> 3);	/* m is now error in rtt est */
		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0) {
			m = -m;		/* m is now abs(error) */
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
			/* This is similar to one of Eifel findings.
			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
			 * for mdev in this case (alpha*beta).
			 * Like Eifel it also prevents growth of rto,
			 * but also it limits too fast rto decreases,
			 * happening in pure Eifel.
			 */
			if (m > 0)
				m >>= 3;
		} else {
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
		}
		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
		if (tp->mdev_us > tp->mdev_max_us) {
			tp->mdev_max_us = tp->mdev_us;
			if (tp->mdev_max_us > tp->rttvar_us)
				tp->rttvar_us = tp->mdev_max_us;
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
			if (tp->mdev_max_us < tp->rttvar_us)
				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max_us = tcp_rto_min_us(sk);
		}
	} else {
		/* no previous measure. */
		srtt = m << 3;		/* take the measured time to be rtt */
		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */
		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
		tp->mdev_max_us = tp->rttvar_us;
		tp->rtt_seq = tp->snd_nxt;
	}
	tp->srtt_us = max(1U, srtt);
}
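
/* Illustrative note, not from the original source: srtt_us is kept
 * left-shifted by 3 and mdev/rttvar by 2, so the update above amounts to
 * srtt = 7/8 * srtt + 1/8 * m and mdev = 3/4 * mdev + 1/4 * |err|.  With
 * srtt = 100 ms and a 120 ms sample the new srtt is 102.5 ms, and the RTO
 * later derived by __tcp_set_rto() is roughly srtt + 4 * rttvar.
 */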

static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u64 rate;

	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	/* current rate is (cwnd * mss) / srtt
	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
	 * In Congestion Avoidance phase, set it to 120 % the current rate.
	 *
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
	 *	 end of slow start and should slow down.
	 */
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
	else
		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (likely(tp->srtt_us))
		do_div(rate, tp->srtt_us);

	/* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
	 * without any lock. We want to make sure the compiler won't store
	 * intermediate values in this location.
	 */
	WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
					     sk->sk_max_pacing_rate));
}
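
/* Illustrative note, not from the original source: with example values
 * mss_cache = 1448, snd_cwnd = 10 and srtt = 10 ms, the base rate
 * cwnd * mss / srtt is roughly 1.45 MB/s; in slow start it is scaled by
 * the (default) 200% ss ratio to ~2.9 MB/s, in congestion avoidance by the
 * 120% ca ratio to ~1.7 MB/s, then capped at sk_max_pacing_rate.
 */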

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static void tcp_set_rto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	/* Old crap is replaced with new one. 8)
	 *
	 * More seriously:
	 * 1. If rtt variance happened to be less 50msec, it is hallucination.
	 *    It cannot be less due to utterly erratic ACK generation made
	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
	 *    to do with delayed acks, because at cwnd>2 true delack timeout
	 *    is invisible. Actually, Linux-2.4 also generates erratic
	 *    ACKs in some circumstances.
	 */
	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);

	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
	 *    with a correct one, which is exactly what we pretend to do.
	 */

	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
	 * guarantees that rto is higher.
	 */
	tcp_bound_rto(sk);
}

__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

	if (!cwnd)
		cwnd = TCP_INIT_CWND;
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}

/* Note that the peer is sending D-SACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
	tp->rack.dsack_seen = 1;
	tp->dsack_dups++;
}

/* It's reordering when higher sequence was delivered (i.e. sacked) before
 * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
 * distance is approximated in full-mss packet distance ("reordering").
 */
static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
				      const int ts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const u32 mss = tp->mss_cache;
	u32 fack, metric;

	fack = tcp_highest_sack_seq(tp);
	if (!before(low_seq, fack))
		return;

	metric = fack - low_seq;
	if ((metric > tp->reordering * mss) && mss) {
#if FASTRETRANS_DEBUG > 1
		pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
			 tp->reordering,
			 0,
			 tp->sacked_out,
			 tp->undo_marker ? tp->undo_retrans : 0);
#endif
		tp->reordering = min_t(u32, (metric + mss - 1) / mss,
				       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
	}

	/* This exciting event is worth remembering. 8) */
	tp->reord_seen++;
	NET_INC_STATS(sock_net(sk),
		      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
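
/* Illustrative note, not from the original source: if the highest SACKed
 * sequence sits 20 segments beyond a never-retransmitted segment that is
 * only now being (s)acked, metric/mss above is ~20 and the reordering
 * degree is raised to 20, bounded by sysctl_tcp_max_reordering.
 */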

/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (!tp->retransmit_skb_hint ||
	    before(TCP_SKB_CB(skb)->seq,
		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
		tp->retransmit_skb_hint = skb;
}

/* Sum the number of packets on the wire we have marked as lost.
 * There are two cases we care about here:
 * a) Packet hasn't been marked lost (nor retransmitted),
 *    and this is the first loss.
 * b) Packet has been marked both lost and retransmitted,
 *    and this means we think it was lost again.
 */
static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	__u8 sacked = TCP_SKB_CB(skb)->sacked;

	if (!(sacked & TCPCB_LOST) ||
	    ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
		tp->lost += tcp_skb_pcount(skb);
}

static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tcp_verify_retransmit_hint(tp, skb);

		tp->lost_out += tcp_skb_pcount(skb);
		tcp_sum_lost(tp, skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
	tcp_verify_retransmit_hint(tp, skb);

	tcp_sum_lost(tp, skb);
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight	Description
 * 0	1		- orig segment is in flight.
 * S	0		- nothing flies, orig reached receiver.
 * L	0		- nothing flies, orig lost by net.
 * R	2		- both orig and retransmit are in flight.
 * L|R	1		- orig is lost, retransmit is in flight.
 * S|R  1		- orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and code short-circuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form finite state machine, controlled by the following events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 * 3. Loss detection event of two flavors:
 *	A. Scoreboard estimator decided the packet is lost.
 *	   A'. Reno "three dupacks" marks head of queue lost.
 *	B. SACK arrives sacking SND.NXT at the moment, when the
 *	   segment was retransmitted.
 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 *
 * It is pleasant to note, that state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by order of our actions,
 * when multiple events arrive simultaneously. (see the function below).
 *
 * Reordering detection.
 * --------------------
 * Reordering metric is maximal distance, which a packet can be displaced
 * in packet stream. With SACKs we can estimate it:
 *
 * 1. SACK fills old hole and the corresponding segment was not
 *    ever retransmitted -> reordering. Alas, we cannot use it
 *    when segment was retransmitted.
 * 2. The last flaw is solved with D-SACK. D-SACK arrives
 *    for retransmitted and already SACKed segment -> reordering..
 * Both of these heuristics are not used in Loss state, when we cannot
 * account for retransmits accurately.
 *
 * SACK block validation.
 * ----------------------
 *
 * SACK block range validation checks that the received SACK block fits to
 * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
 * Note that SND.UNA is not included to the range though being valid because
 * it means that the receiver is rather inconsistent with itself reporting
 * SACK reneging when it should advance SND.UNA. Such SACK block this is
 * perfectly valid, however, in light of RFC2018 which explicitly states
 * that "SACK block MUST reflect the newest segment.  Even if the newest
 * segment is going to be discarded ...", not that it looks very clever
 *  in case of head skb. Due to potential receiver driven attacks, we
 * choose to avoid immediate execution of a walk in write queue due to
 * reneging and defer head skb's loss recovery to standard loss recovery
 * procedure that will eventually trigger (nothing forbids us doing this).
 *
 * Implements also blockage to start_seq wrap-around. Problem lies in the
 * fact that though start_seq (s) is before end_seq (i.e., not reversed),
 * there's no guarantee that it will be before snd_nxt (n). The problem
 * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
 * wrap (s_w):
 *
 *         <- outs wnd ->                          <- wrapzone ->
 *         u     e      n                         u_w   e_w  s n_w
 *         |     |      |                          |     |   |  |
 * |<------------+------+----- TCP seqno space --------------+---------->|
 * ...-- <2^31 ->|                                           |<--------...
 * ...---- >2^31 ------>|                                    |<--------...
 *
 * Current code wouldn't be vulnerable but it's better still to discard such
 * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
 * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
 * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
 * equal to the ideal case (infinite seqno space without wrap caused issues).
 *
 * With D-SACK the lower bound is extended to cover sequence space below
 * SND.UNA down to undo_marker, which is the last point of interest. Yet
 * again, D-SACK block must not to go across snd_una (for the same reason as
 * for the normal SACK blocks, explained above). But there all simplicity
 * ends, TCP might receive valid D-SACKs below that. As long as they reside
 * fully below undo_marker they do not affect behavior in anyway and can
 * therefore be safely ignored. In rare cases (which are more or less
 * theoretical ones), the D-SACK will nicely cross that boundary due to skb
 * fragmentation and packet reordering past skb's retransmission. To consider
 * them correctly, the acceptable range must be extended even more though
 * the exact amount is rather hard to quantify. However, tp->max_window can
 * be used as an exaggerated estimate.
 */
static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				   u32 start_seq, u32 end_seq)
{
	/* Too far in future, or reversed (interpretation is ambiguous) */
	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
		return false;

	/* Nasty start_seq wrap-around check (see comments above) */
	if (!before(start_seq, tp->snd_nxt))
		return false;

	/* In outstanding window? ...This is valid exit for D-SACKs too.
	 * start_seq == snd_una is non-sensical (see comments above)
	 */
	if (after(start_seq, tp->snd_una))
		return true;

	if (!is_dsack || !tp->undo_marker)
		return false;

	/* ...Then it's D-SACK, and must reside below snd_una completely */
	if (after(end_seq, tp->snd_una))
		return false;

	if (!before(start_seq, tp->undo_marker))
		return true;

	/* Too old */
	if (!after(end_seq, tp->undo_marker))
		return false;

	/* Undo_marker boundary crossing (overestimates a lot). Known already:
	 *   start_seq < undo_marker and end_seq >= undo_marker.
	 */
	return !before(start_seq, end_seq - tp->max_window);
}
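
/* Illustrative note, not from the original source: with snd_una = 1000 and
 * snd_nxt = 5000, a SACK block [2000,3000) passes the checks above, a block
 * [6000,7000) is rejected as beyond snd_nxt, and a D-SACK entirely below
 * snd_una is accepted only while it does not fall too far below undo_marker
 * (roughly within tp->max_window of it).
 */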

static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
			    struct tcp_sack_block_wire *sp, int num_sacks,
			    u32 prior_snd_una)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
	bool dup_sack = false;

	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
		dup_sack = true;
		tcp_dsack_seen(tp);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
	} else if (num_sacks > 1) {
		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);

		if (!after(end_seq_0, end_seq_1) &&
		    !before(start_seq_0, start_seq_1)) {
			dup_sack = true;
			tcp_dsack_seen(tp);
			NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPDSACKOFORECV);
		}
	}

	/* D-SACK for already forgotten data... Do dumb counting. */
	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
	    !after(end_seq_0, prior_snd_una) &&
	    after(end_seq_0, tp->undo_marker))
		tp->undo_retrans--;

	return dup_sack;
}

struct tcp_sacktag_state {
	u32	reord;
	/* Timestamps for earliest and latest never-retransmitted segment
	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
	 * but congestion control should still get an accurate delay signal.
	 */
	u64	first_sackt;
	u64	last_sackt;
	struct rate_sample *rate;
	int	flag;
	unsigned int mss_now;
};

/* Check if skb is fully within the SACK block. In presence of GSO skbs,
 * the incoming SACK may not exactly match but we can find smaller MSS
 * aligned portion of it that matches. Therefore we might need to fragment
 * which may fail and create some hassle (caller must handle error case
 * returns).
 *
 * FIXME: this could be merged to shift decision code
 */
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
				  u32 start_seq, u32 end_seq)
{
	int err;
	bool in_sack;
	unsigned int pkt_len;
	unsigned int mss;

	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);

	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
	    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
		mss = tcp_skb_mss(skb);
		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);

		if (!in_sack) {
			pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
			if (pkt_len < mss)
				pkt_len = mss;
		} else {
			pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
			if (pkt_len < mss)
				return -EINVAL;
		}

		/* Round if necessary so that SACKs cover only full MSSes
		 * and/or the remaining small portion (if present)
		 */
		if (pkt_len > mss) {
			unsigned int new_len = (pkt_len / mss) * mss;
			if (!in_sack && new_len < pkt_len)
				new_len += mss;
			pkt_len = new_len;
		}

		if (pkt_len >= skb->len && !in_sack)
			return 0;

		err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				   pkt_len, mss, GFP_ATOMIC);
		if (err < 0)
			return err;
	}

	return in_sack;
}

/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
static u8 tcp_sacktag_one(struct sock *sk,
			  struct tcp_sacktag_state *state, u8 sacked,
			  u32 start_seq, u32 end_seq,
			  int dup_sack, int pcount,
			  u64 xmit_time)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Account D-SACK for retransmitted packet. */
	if (dup_sack && (sacked & TCPCB_RETRANS)) {
		if (tp->undo_marker && tp->undo_retrans > 0 &&
		    after(end_seq, tp->undo_marker))
			tp->undo_retrans--;
		if ((sacked & TCPCB_SACKED_ACKED) &&
		    before(start_seq, state->reord))
				state->reord = start_seq;
	}

	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
	if (!after(end_seq, tp->snd_una))
		return sacked;

	if (!(sacked & TCPCB_SACKED_ACKED)) {
		tcp_rack_advance(tp, sacked, end_seq, xmit_time);

		if (sacked & TCPCB_SACKED_RETRANS) {
			/* If the segment is not tagged as lost,
			 * we do not clear RETRANS, believing
			 * that retransmission is still in flight.
			 */
			if (sacked & TCPCB_LOST) {
				sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
				tp->lost_out -= pcount;
				tp->retrans_out -= pcount;
			}
		} else {
			if (!(sacked & TCPCB_RETRANS)) {
				/* New sack for not retransmitted frame,
				 * which was in hole. It is reordering.
				 */
				if (before(start_seq,