summaryrefslogtreecommitdiffhomepage
path: root/pkg/tcpip/stack/tcp.go
blob: a941091b0f69c7c02a8652ca3b42651936c7db40 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stack

import (
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/internal/tcp"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
)

// TCPProbeFunc is the expected function type for a TCP probe function to be
// passed to stack.AddTCPProbe. The probe receives s, a copy of the internal
// state of a TCP endpoint (see TCPEndpointState).
type TCPProbeFunc func(s TCPEndpointState)

// TCPCubicState is used to hold a copy of the internal cubic state when the
// TCPProbeFunc is invoked.
//
// +stateify savable
type TCPCubicState struct {
	// WLastMax is the previous WMax value.
	WLastMax float64

	// WMax is the value of the congestion window at the time of the last
	// congestion event.
	WMax float64

	// T is the time when the current congestion avoidance was entered.
	T tcpip.MonotonicTime

	// TimeSinceLastCongestion denotes the time since the current
	// congestion avoidance was entered.
	TimeSinceLastCongestion time.Duration

	// C is the cubic constant as specified in RFC8312, page 11.
	C float64

	// K is the time period (in seconds) that the CUBIC window function
	// (Eq. 1, see WC below) takes to increase the current window size to
	// WMax if there are no further congestion events. It is calculated
	// using the following equation:
	//
	// K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5)
	K float64

	// Beta is the CUBIC multiplication decrease factor. That is, when a
	// congestion event is detected, CUBIC reduces its cwnd to
	// WC(0)=WMax*beta_cubic.
	Beta float64

	// WC is the window computed by CUBIC at time TimeSinceLastCongestion.
	// It's calculated using the formula:
	//
	//  WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1)
	//
	// where t is TimeSinceLastCongestion.
	WC float64

	// WEst is the window computed by CUBIC at time
	// TimeSinceLastCongestion+RTT, i.e. WC(TimeSinceLastCongestion+RTT).
	//
	// NOTE(review): RFC 8312 uses W_est for the TCP-friendly window
	// estimate rather than W_cubic(t+RTT); confirm this description
	// against the CUBIC implementation that populates this field.
	WEst float64
}

// TCPRACKState is used to hold a copy of the internal RACK state when the
// TCPProbeFunc is invoked.
//
// +stateify savable
type TCPRACKState struct {
	// XmitTime is the transmission timestamp of the most recent
	// acknowledged segment.
	XmitTime tcpip.MonotonicTime

	// EndSequence is the ending TCP sequence number of the most recent
	// acknowledged segment.
	EndSequence seqnum.Value

	// FACK is the highest selectively or cumulatively acknowledged
	// sequence.
	FACK seqnum.Value

	// RTT is the round trip time of the most recently delivered packet on
	// the connection (either cumulatively acknowledged or selectively
	// acknowledged) that was not marked invalid as a possible spurious
	// retransmission.
	RTT time.Duration

	// Reord is true iff reordering has been detected on this connection.
	Reord bool

	// DSACKSeen is true iff the connection has seen a DSACK.
	DSACKSeen bool

	// ReoWnd is the reordering window time used for recording packet
	// transmission times. It is used to defer the moment at which RACK
	// marks a packet lost.
	ReoWnd time.Duration

	// ReoWndIncr is the multiplier applied to adjust the reorder window.
	ReoWndIncr uint8

	// ReoWndPersist is the number of loss recoveries before resetting
	// the reorder window.
	ReoWndPersist int8

	// RTTSeq is the SND.NXT when RTT is updated.
	RTTSeq seqnum.Value
}

// TCPEndpointID is the unique 4-tuple that identifies a given endpoint.
//
// +stateify savable
type TCPEndpointID struct {
	// LocalPort is the local port associated with the endpoint.
	LocalPort uint16

	// LocalAddress is the local [network layer] address associated with
	// the endpoint.
	LocalAddress tcpip.Address

	// RemotePort is the remote port associated with the endpoint.
	RemotePort uint16

	// RemoteAddress is the remote [network layer] address associated with
	// the endpoint.
	RemoteAddress tcpip.Address
}

// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
// TCP endpoint.
//
// +stateify savable
type TCPFastRecoveryState struct {
	// Active, if true, indicates the endpoint is in fast recovery. The
	// following fields are only meaningful when Active is true.
	Active bool

	// First is the first unacknowledged sequence number being recovered.
	First seqnum.Value

	// Last is the 'recover' sequence number that indicates the point at
	// which we should exit recovery, barring any timeouts etc.
	Last seqnum.Value

	// MaxCwnd is the maximum value to which we are permitted to grow the
	// congestion window during recovery. This is set at the time we enter
	// recovery. It exists to avoid attacks where the receiver
	// intentionally sends duplicate acks to artificially inflate the
	// sender's cwnd.
	MaxCwnd int

	// HighRxt is the highest sequence number which has been retransmitted
	// during the current loss recovery phase. See RFC 6675 Section 2 for
	// details.
	HighRxt seqnum.Value

	// RescueRxt is the highest sequence number which has been
	// optimistically retransmitted to prevent stalling of the ACK clock
	// when there is loss at the end of the window and no new data is
	// available for transmission. See RFC 6675 Section 2 for details.
	RescueRxt seqnum.Value
}

// TCPReceiverState holds a copy of the internal state of the receiver for a
// given TCP endpoint.
//
// +stateify savable
type TCPReceiverState struct {
	// RcvNxt is the TCP variable RCV.NXT.
	RcvNxt seqnum.Value

	// RcvAcc is one beyond the last acceptable sequence number. That is,
	// the "largest" sequence value that the receiver has announced to its
	// peer that it's willing to accept. This may differ from RcvNxt +
	// (last advertised receive window) if the receive window is reduced;
	// in that case the window has to be reduced as more data is received
	// instead of being shrunk.
	RcvAcc seqnum.Value

	// RcvWndScale is the window scaling to use for inbound segments.
	RcvWndScale uint8

	// PendingBufUsed is the number of bytes pending in the receive queue.
	PendingBufUsed int
}

// TCPRTTState holds a copy of information about the endpoint's round trip
// time.
//
// +stateify savable
type TCPRTTState struct {
	// SRTT is the smoothed round trip time defined in section 2 of RFC
	// 6298.
	SRTT time.Duration

	// RTTVar is the round-trip time variation as defined in section 2 of
	// RFC 6298.
	RTTVar time.Duration

	// SRTTInited, if true, indicates that a valid RTT measurement has
	// been completed.
	SRTTInited bool
}

// TCPSenderState holds a copy of the internal state of the sender for a given
// TCP endpoint.
//
// +stateify savable
type TCPSenderState struct {
	// LastSendTime is the timestamp at which we sent the last segment.
	LastSendTime tcpip.MonotonicTime

	// DupAckCount is the number of duplicate ACKs received. It is used for
	// fast retransmit.
	DupAckCount int

	// SndCwnd is the size of the sending congestion window in packets.
	SndCwnd int

	// Ssthresh is the threshold between slow start and congestion
	// avoidance.
	Ssthresh int

	// SndCAAckCount is the number of packets acknowledged during
	// congestion avoidance. When enough packets have been ack'd (typically
	// cwnd packets), the congestion window is incremented by one.
	SndCAAckCount int

	// Outstanding is the number of packets that have been sent but not yet
	// acknowledged.
	Outstanding int

	// SackedOut is the number of packets which have been selectively
	// acked.
	SackedOut int

	// SndWnd is the send window size in bytes.
	SndWnd seqnum.Size

	// SndUna is the next unacknowledged sequence number.
	SndUna seqnum.Value

	// SndNxt is the sequence number of the next segment to be sent.
	SndNxt seqnum.Value

	// RTTMeasureSeqNum is the sequence number being used for the latest
	// RTT measurement.
	RTTMeasureSeqNum seqnum.Value

	// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
	RTTMeasureTime tcpip.MonotonicTime

	// Closed indicates that the caller has closed the endpoint for
	// sending.
	Closed bool

	// RTO is the retransmit timeout as defined in section 2 of RFC 6298.
	RTO time.Duration

	// RTTState holds information about the endpoint's round trip time.
	RTTState TCPRTTState

	// MaxPayloadSize is the maximum size of the payload of a given
	// segment. It is initialized on demand.
	MaxPayloadSize int

	// SndWndScale is the number of bits to shift left when reading the
	// send window size from a segment.
	SndWndScale uint8

	// MaxSentAck is the highest acknowledgement number sent till now.
	MaxSentAck seqnum.Value

	// FastRecovery holds the fast recovery state for the endpoint.
	FastRecovery TCPFastRecoveryState

	// Cubic holds the state related to CUBIC congestion control.
	Cubic TCPCubicState

	// RACKState holds the state related to the RACK loss detection
	// algorithm.
	RACKState TCPRACKState

	// RetransmitTS records the timestamp used to detect spurious recovery.
	RetransmitTS uint32

	// SpuriousRecovery indicates if the sender entered recovery spuriously.
	SpuriousRecovery bool
}

// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
//
// +stateify savable
type TCPSACKInfo struct {
	// Blocks is the list of SACK blocks that identify the out-of-order
	// segments held by a given TCP endpoint.
	Blocks []header.SACKBlock

	// ReceivedBlocks are the SACK blocks received by this endpoint from
	// the peer endpoint.
	ReceivedBlocks []header.SACKBlock

	// MaxSACKED is the highest sequence number that has been SACKed by the
	// peer.
	MaxSACKED seqnum.Value
}

// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
//
// +stateify savable
type RcvBufAutoTuneParams struct {
	// MeasureTime is the time at which the current measurement was
	// started.
	MeasureTime tcpip.MonotonicTime

	// CopiedBytes is the number of bytes copied to userspace since this
	// measurement began.
	CopiedBytes int

	// PrevCopiedBytes is the number of bytes copied to userspace in the
	// previous RTT period.
	PrevCopiedBytes int

	// RcvBufSize is the auto-tuned receive buffer size.
	RcvBufSize int

	// RTT is the smoothed RTT as measured by observing the time between
	// when a byte is first acknowledged and the receipt of data that is at
	// least one window beyond the sequence number that was acknowledged.
	RTT time.Duration

	// RTTVar is the "round-trip time variation" as defined in section 2 of
	// RFC 6298.
	RTTVar time.Duration

	// RTTMeasureSeqNumber is the highest acceptable sequence number at the
	// time this RTT measurement period began.
	RTTMeasureSeqNumber seqnum.Value

	// RTTMeasureTime is the absolute time at which the current RTT
	// measurement period began.
	RTTMeasureTime tcpip.MonotonicTime

	// Disabled is true if an explicit receive buffer is set for the
	// endpoint.
	Disabled bool
}

// TCPRcvBufState contains information about the state of an endpoint's receive
// socket buffer.
//
// +stateify savable
type TCPRcvBufState struct {
	// RcvBufUsed is the number of bytes actually held in the receive
	// socket buffer for the endpoint.
	RcvBufUsed int

	// RcvAutoParams holds the state variables (see RcvBufAutoTuneParams)
	// used to compute the auto-tuned receive buffer size.
	RcvAutoParams RcvBufAutoTuneParams

	// RcvClosed, if true, indicates the endpoint has been closed for
	// reading.
	RcvClosed bool
}

// TCPSndBufState contains information about the state of an endpoint's send
// socket buffer.
//
// +stateify savable
type TCPSndBufState struct {
	// SndBufSize is the size of the socket send buffer.
	SndBufSize int

	// SndBufUsed is the number of bytes held in the socket send buffer.
	SndBufUsed int

	// SndClosed indicates that the endpoint has been closed for sends.
	SndClosed bool

	// PacketTooBigCount is used to notify the main protocol routine how
	// many times a "packet too big" control packet is received.
	PacketTooBigCount int

	// SndMTU is the smallest MTU seen in the control packets received.
	SndMTU int

	// AutoTuneSndBufDisabled indicates that the auto tuning of the send
	// buffer is disabled.
	//
	// Must be accessed using atomic operations.
	//
	// NOTE(review): stored as a uint32 rather than a bool; presumably a
	// nonzero value means disabled -- confirm against the code that sets
	// it.
	AutoTuneSndBufDisabled uint32
}

// TCPEndpointStateInner contains the members of TCPEndpointState used directly
// (that is, not within another containing struct) within the endpoint's
// internal implementation.
//
// +stateify savable
type TCPEndpointStateInner struct {
	// TSOffset is a randomized offset added to the value of the TSVal
	// field in the timestamp option.
	TSOffset tcp.TSOffset

	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
	// option in the SYN/SYN-ACK.
	SACKPermitted bool

	// SendTSOk is used to indicate when the TS Option has been negotiated.
	// When SendTSOk is true every non-RST segment should carry a TS as per
	// RFC7323#section-1.1.
	SendTSOk bool

	// RecentTS is the timestamp that should be sent in the TSEcr field of
	// the timestamp option for future segments sent by the endpoint. This
	// field is updated if required when a new segment is received by this
	// endpoint.
	RecentTS uint32
}

// TCPEndpointState is a copy of the internal state of a TCP endpoint. It is
// the value handed to a TCPProbeFunc.
//
// +stateify savable
type TCPEndpointState struct {
	// TCPEndpointStateInner contains the members of TCPEndpointState used
	// by the endpoint's internal implementation. It is embedded, so its
	// fields are accessible directly on TCPEndpointState.
	TCPEndpointStateInner

	// ID is a copy of the TransportEndpointID for the endpoint.
	ID TCPEndpointID

	// SegTime denotes the absolute time when this segment was received.
	//
	// NOTE(review): "this segment" is presumably the segment whose
	// processing produced this state copy -- confirm against the code
	// that sets it.
	SegTime tcpip.MonotonicTime

	// RcvBufState contains information about the state of the endpoint's
	// receive socket buffer.
	RcvBufState TCPRcvBufState

	// SndBufState contains information about the state of the endpoint's
	// send socket buffer.
	SndBufState TCPSndBufState

	// SACK holds TCP SACK related information for this endpoint.
	SACK TCPSACKInfo

	// Receiver holds variables related to the TCP receiver for the
	// endpoint.
	Receiver TCPReceiverState

	// Sender holds state related to the TCP sender for the endpoint.
	Sender TCPSenderState
}