diff options
Diffstat (limited to 'pkg/tcpip/transport')
23 files changed, 721 insertions, 828 deletions
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 440cb0352..94fcd72d9 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -49,6 +49,7 @@ const ( // +stateify savable type endpoint struct { stack.TransportEndpointInfo + tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. @@ -71,7 +72,7 @@ type endpoint struct { // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags state endpointState - route stack.Route `state:"manual"` + route *stack.Route `state:"manual"` ttl uint8 stats tcpip.TransportEndpointStats `state:"nosave"` // linger is used for SO_LINGER socket option. @@ -85,7 +86,7 @@ type endpoint struct { } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return &endpoint{ + ep := &endpoint{ stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, @@ -96,7 +97,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt sndBufSize: 32 * 1024, state: stateInitial, uniqueID: s.UniqueID(), - }, nil + } + ep.ops.InitHandler(ep) + return ep, nil } // UniqueID implements stack.TransportEndpoint.UniqueID. @@ -129,7 +132,10 @@ func (e *endpoint) Close() { } e.rcvMu.Unlock() - e.route.Release() + if e.route != nil { + e.route.Release() + e.route = nil + } // Update the state. e.state = stateClosed @@ -142,6 +148,7 @@ func (e *endpoint) Close() { // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (e *endpoint) ModerateRecvBuf(copied int) {} +// SetOwner implements tcpip.Endpoint.SetOwner. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } @@ -267,26 +274,8 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } } - var route *stack.Route - if to == nil { - route = &e.route - - if route.IsResolutionRequired() { - // Promote lock to exclusive if using a shared route, - // given that it may need to change in Route.Resolve() - // call below. - e.mu.RUnlock() - defer e.mu.RLock() - - e.mu.Lock() - defer e.mu.Unlock() - - // Recheck state after lock was re-acquired. - if e.state != stateConnected { - return 0, nil, tcpip.ErrInvalidEndpointState - } - } - } else { + route := e.route + if to != nil { // Reject destination address if it goes through a different // NIC than the endpoint was bound to. nicID := to.NIC @@ -310,7 +299,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } defer r.Release() - route = &r + route = r } if route.IsResolutionRequired() { @@ -361,11 +350,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { return nil } -// SetSockOptBool sets a socket option. Currently not supported. -func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - return nil -} - // SetSockOptInt sets a socket option. Currently not supported. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { switch opt { @@ -378,17 +362,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { return nil } -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption: - return false, nil - - default: - return false, tcpip.ErrUnknownProtocolOption - } -} - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { @@ -857,6 +830,7 @@ func (*endpoint) LastError() *tcpip.Error { return nil } +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 3bff3755a..3666bac0f 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -60,6 +60,8 @@ type packet struct { // +stateify savable type endpoint struct { stack.TransportEndpointInfo + tcpip.DefaultSocketOptionsHandler + // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` @@ -107,6 +109,7 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb rcvBufSizeMax: 32 * 1024, sndBufSize: 32 * 1024, } + ep.ops.InitHandler(ep) // Override with stack defaults. var ss stack.SendBufferSizeOption @@ -318,11 +321,6 @@ func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { } } -// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. -func (ep *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { switch opt { @@ -390,16 +388,6 @@ func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { } } -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (*endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - case tcpip.AcceptConnOption: - return false, nil - default: - return false, tcpip.ErrNotSupported - } -} - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { @@ -551,8 +539,10 @@ func (ep *endpoint) Stats() tcpip.EndpointStats { return &ep.stats } +// SetOwner implements tcpip.Endpoint.SetOwner. func (ep *endpoint) SetOwner(owner tcpip.PacketOwner) {} +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (ep *endpoint) SocketOptions() *tcpip.SocketOptions { return &ep.ops } diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 4ae1f92ab..0840a4b3d 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -58,12 +58,13 @@ type rawPacket struct { // +stateify savable type endpoint struct { stack.TransportEndpointInfo + tcpip.DefaultSocketOptionsHandler + // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` waiterQueue *waiter.Queue associated bool - hdrIncluded bool // The following fields are used to manage the receive queue and are // protected by rcvMu. @@ -82,7 +83,7 @@ type endpoint struct { bound bool // route is the route to a remote network endpoint. It is set via // Connect(), and is valid only when conneted is true. - route stack.Route `state:"manual"` + route *stack.Route `state:"manual"` stats tcpip.TransportEndpointStats `state:"nosave"` // linger is used for SO_LINGER socket option. linger tcpip.LingerOption @@ -114,8 +115,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt rcvBufSizeMax: 32 * 1024, sndBufSizeMax: 32 * 1024, associated: associated, - hdrIncluded: !associated, } + e.ops.InitHandler(e) + e.ops.SetHeaderIncluded(!associated) // Override with stack defaults. var ss stack.SendBufferSizeOption @@ -170,9 +172,11 @@ func (e *endpoint) Close() { e.rcvList.Remove(e.rcvList.Front()) } - if e.connected { + e.connected = false + + if e.route != nil { e.route.Release() - e.connected = false + e.route = nil } e.closed = true @@ -266,7 +270,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c // If this is an unassociated socket and callee provided a nonzero // destination address, route using that address. - if e.hdrIncluded { + if e.ops.GetHeaderIncluded() { ip := header.IPv4(payloadBytes) if !ip.IsValid(len(payloadBytes)) { e.mu.RUnlock() @@ -296,7 +300,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } if e.route.IsResolutionRequired() { - savedRoute := &e.route + savedRoute := e.route // Promote lock to exclusive if using a shared route, // given that it may need to change in finishWrite. e.mu.RUnlock() @@ -304,7 +308,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c // Make sure that the route didn't change during the // time we didn't hold the lock. - if !e.connected || savedRoute != &e.route { + if !e.connected || savedRoute != e.route { e.mu.Unlock() return 0, nil, tcpip.ErrInvalidEndpointState } @@ -314,7 +318,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c return n, ch, err } - n, ch, err := e.finishWrite(payloadBytes, &e.route) + n, ch, err := e.finishWrite(payloadBytes, e.route) e.mu.RUnlock() return n, ch, err } @@ -335,7 +339,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c return 0, nil, err } - n, ch, err := e.finishWrite(payloadBytes, &route) + n, ch, err := e.finishWrite(payloadBytes, route) route.Release() e.mu.RUnlock() return n, ch, err @@ -356,7 +360,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64, } } - if e.hdrIncluded { + if e.ops.GetHeaderIncluded() { pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.View(payloadBytes).ToVectorisedView(), }) @@ -531,18 +535,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { } } -// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. -func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - switch opt { - case tcpip.IPHdrIncludedOption: - e.mu.Lock() - e.hdrIncluded = v - e.mu.Unlock() - return nil - } - return tcpip.ErrUnknownProtocolOption -} - // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { switch opt { @@ -601,23 +593,6 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { } } -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption: - return false, nil - - case tcpip.IPHdrIncludedOption: - e.mu.Lock() - v := e.hdrIncluded - e.mu.Unlock() - return v, nil - - default: - return false, tcpip.ErrUnknownProtocolOption - } -} - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { @@ -756,10 +731,12 @@ func (e *endpoint) Stats() tcpip.EndpointStats { // Wait implements stack.TransportEndpoint.Wait. func (*endpoint) Wait() {} +// LastError implements tcpip.Endpoint.LastError. func (*endpoint) LastError() *tcpip.Error { return nil } +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go index 7d97cbdc7..4a7e1c039 100644 --- a/pkg/tcpip/transport/raw/endpoint_state.go +++ b/pkg/tcpip/transport/raw/endpoint_state.go @@ -73,7 +73,13 @@ func (e *endpoint) Resume(s *stack.Stack) { // If the endpoint is connected, re-connect. if e.connected { var err *tcpip.Error - e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, e.route.RemoteAddress, e.NetProto, false) + // TODO(gvisor.dev/issue/4906): Properly restore the route with the right + // remote address. We used to pass e.remote.RemoteAddress which was + // effectively the empty address but since moving e.route to hold a pointer + // to a route instead of the route by value, we pass the empty address + // directly. Obviously this was always wrong since we should provide the + // remote address we were connected to, to properly restore the route. + e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, "", e.NetProto, false) if err != nil { panic(err) } diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 518449602..cf232b508 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "more_shards") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -45,7 +45,9 @@ go_library( "rcv.go", "rcv_state.go", "reno.go", + "reno_recovery.go", "sack.go", + "sack_recovery.go", "sack_scoreboard.go", "segment.go", "segment_heap.go", @@ -91,7 +93,7 @@ go_test( "tcp_test.go", "tcp_timestamp_test.go", ], - shard_count = 10, + shard_count = more_shards, deps = [ ":tcp", "//pkg/rand", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 6e5adc383..3e1041cbe 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -213,7 +213,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i route.ResolveWith(s.remoteLinkAddr) n := newEndpoint(l.stack, netProto, queue) - n.v6only = l.v6Only + n.ops.SetV6Only(l.v6Only) n.ID = s.id n.boundNICID = s.nicID n.route = route @@ -599,7 +599,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) *tcpip.Er ack: s.sequenceNumber + 1, rcvWnd: ctx.rcvWnd, } - if err := e.sendSynTCP(&route, fields, synOpts); err != nil { + if err := e.sendSynTCP(route, fields, synOpts); err != nil { return err } e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment() @@ -752,7 +752,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) *tcpip.Er // its own goroutine and is responsible for handling connection requests. func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) { e.mu.Lock() - v6Only := e.v6only + v6Only := e.ops.GetV6Only() ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto) defer func() { diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index ac6d879a7..c944dccc0 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -16,6 +16,7 @@ package tcp import ( "encoding/binary" + "math" "time" "gvisor.dev/gvisor/pkg/rand" @@ -133,7 +134,7 @@ func FindWndScale(wnd seqnum.Size) int { return 0 } - max := seqnum.Size(0xffff) + max := seqnum.Size(math.MaxUint16) s := 0 for wnd > max && s < header.MaxWndScale { s++ @@ -300,7 +301,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error { if ttl == 0 { ttl = h.ep.route.DefaultTTL() } - h.ep.sendSynTCP(&h.ep.route, tcpFields{ + h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.ID, ttl: ttl, tos: h.ep.sendTOS, @@ -361,7 +362,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error { SACKPermitted: h.ep.sackPermitted, MSS: h.ep.amss, } - h.ep.sendSynTCP(&h.ep.route, tcpFields{ + h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.ID, ttl: h.ep.ttl, tos: h.ep.sendTOS, @@ -496,7 +497,7 @@ func (h *handshake) resolveRoute() *tcpip.Error { h.ep.mu.Lock() } if n¬ifyError != 0 { - return h.ep.LastError() + return h.ep.lastErrorLocked() } } @@ -547,7 +548,7 @@ func (h *handshake) start() *tcpip.Error { } h.sendSYNOpts = synOpts - h.ep.sendSynTCP(&h.ep.route, tcpFields{ + h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.ID, ttl: h.ep.ttl, tos: h.ep.sendTOS, @@ -575,7 +576,6 @@ func (h *handshake) complete() *tcpip.Error { return err } defer timer.stop() - for h.state != handshakeCompleted { // Unlock before blocking, and reacquire again afterwards (h.ep.mu is held // throughout handshake processing). @@ -597,7 +597,7 @@ func (h *handshake) complete() *tcpip.Error { // the connection with another ACK or data (as ACKs are never // retransmitted on their own). if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept { - h.ep.sendSynTCP(&h.ep.route, tcpFields{ + h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.ID, ttl: h.ep.ttl, tos: h.ep.sendTOS, @@ -631,9 +631,8 @@ func (h *handshake) complete() *tcpip.Error { h.ep.mu.Lock() } if n¬ifyError != 0 { - return h.ep.LastError() + return h.ep.lastErrorLocked() } - case wakerForNewSegment: if err := h.processSegments(); err != nil { return err @@ -820,8 +819,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso data = data.Clone(nil) optLen := len(tf.opts) - if tf.rcvWnd > 0xffff { - tf.rcvWnd = 0xffff + if tf.rcvWnd > math.MaxUint16 { + tf.rcvWnd = math.MaxUint16 } mss := int(gso.MSS) @@ -865,8 +864,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso // network endpoint and under the provided identity. func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error { optLen := len(tf.opts) - if tf.rcvWnd > 0xffff { - tf.rcvWnd = 0xffff + if tf.rcvWnd > math.MaxUint16 { + tf.rcvWnd = math.MaxUint16 } if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() { @@ -941,7 +940,7 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] } options := e.makeOptions(sackBlocks) - err := e.sendTCP(&e.route, tcpFields{ + err := e.sendTCP(e.route, tcpFields{ id: e.ID, ttl: e.ttl, tos: e.sendTOS, @@ -1002,7 +1001,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) { // Only send a reset if the connection is being aborted for a reason // other than receiving a reset. e.setEndpointState(StateError) - e.HardError = err + e.hardError = err if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout { // The exact sequence number to be used for the RST is the same as the // one used by Linux. We need to handle the case of window being shrunk @@ -1080,7 +1079,7 @@ func (e *endpoint) transitionToStateCloseLocked() { // to any other listening endpoint. We reply with RST if we cannot find one. func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, s.nicID) - if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != "" { + if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != "" { // Dual-stack socket, try IPv4. ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, s.nicID) } @@ -1141,7 +1140,7 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) { // delete the TCB, and return. case StateCloseWait: e.transitionToStateCloseLocked() - e.HardError = tcpip.ErrAborted + e.hardError = tcpip.ErrAborted e.notifyProtocolGoroutine(notifyTickleWorker) return false, nil default: @@ -1286,7 +1285,7 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error { userTimeout := e.userTimeout e.keepalive.Lock() - if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() { + if !e.SocketOptions().GetKeepAlive() || !e.keepalive.timer.checkExpiration() { e.keepalive.Unlock() return nil } @@ -1323,7 +1322,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) { } // Start the keepalive timer IFF it's enabled and there is no pending // data to send. - if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt { + if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.sndUna != e.snd.sndNxt { e.keepalive.timer.disable() e.keepalive.Unlock() return @@ -1353,7 +1352,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ epilogue := func() { // e.mu is expected to be hold upon entering this section. - if e.snd != nil { e.snd.resendTimer.cleanup() } @@ -1383,7 +1381,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ e.lastErrorMu.Unlock() e.setEndpointState(StateError) - e.HardError = err + e.hardError = err e.workerCleanup = true // Lock released below. @@ -1638,7 +1636,7 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func() } extTW, newSyn := e.rcv.handleTimeWaitSegment(s) if newSyn { - info := e.EndpointInfo.TransportEndpointInfo + info := e.TransportEndpointInfo newID := info.ID newID.RemoteAddress = "" newID.RemotePort = 0 diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index a6f25896b..1d1b01a6c 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -405,14 +405,6 @@ func testV4Accept(t *testing.T, c *context.Context) { } } - // Make sure we get the same error when calling the original ep and the - // new one. This validates that v4-mapped endpoints are still able to - // query the V6Only flag, whereas pure v4 endpoints are not. - _, expected := c.EP.GetSockOptBool(tcpip.V6OnlyOption) - if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != expected { - t.Fatalf("GetSockOpt returned unexpected value: got %v, want %v", err, expected) - } - // Check the peer address. addr, err := nep.GetRemoteAddress() if err != nil { @@ -530,12 +522,12 @@ func TestV6AcceptOnV6(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) var addr tcpip.FullAddress - nep, _, err := c.EP.Accept(&addr) + _, _, err := c.EP.Accept(&addr) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - nep, _, err = c.EP.Accept(&addr) + _, _, err = c.EP.Accept(&addr) if err != nil { t.Fatalf("Accept failed: %v", err) } @@ -548,12 +540,6 @@ func TestV6AcceptOnV6(t *testing.T) { if addr.Addr != context.TestV6Addr { t.Errorf("Unexpected remote address: got %s, want %s", addr.Addr, context.TestV6Addr) } - - // Make sure we can still query the v6 only status of the new endpoint, - // that is, that it is in fact a v6 socket. - if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil { - t.Errorf("GetSockOptBool(tcpip.V6OnlyOption) failed: %s", err) - } } func TestV4AcceptOnV4(t *testing.T) { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 4f4f4c65e..87eda2efb 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -310,16 +310,12 @@ type Stats struct { func (*Stats) IsEndpointStats() {} // EndpointInfo holds useful information about a transport endpoint which -// can be queried by monitoring tools. +// can be queried by monitoring tools. This exists to allow tcp-only state to +// be exposed. // // +stateify savable type EndpointInfo struct { stack.TransportEndpointInfo - - // HardError is meaningful only when state is stateError. It stores the - // error to be returned when read/write syscalls are called and the - // endpoint is in this state. HardError is protected by endpoint mu. - HardError *tcpip.Error `state:".(string)"` } // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo @@ -367,6 +363,7 @@ func (*EndpointInfo) IsEndpointInfo() {} // +stateify savable type endpoint struct { EndpointInfo + tcpip.DefaultSocketOptionsHandler // endpointEntry is used to queue endpoints for processing to the // a given tcp processor goroutine. @@ -386,6 +383,11 @@ type endpoint struct { waiterQueue *waiter.Queue `state:"wait"` uniqueID uint64 + // hardError is meaningful only when state is stateError. It stores the + // error to be returned when read/write syscalls are called and the + // endpoint is in this state. hardError is protected by endpoint mu. + hardError *tcpip.Error `state:".(string)"` + // lastError represents the last error that the endpoint reported; // access to it is protected by the following mutex. lastErrorMu sync.Mutex `state:"nosave"` @@ -421,7 +423,10 @@ type endpoint struct { // mu protects all endpoint fields unless documented otherwise. mu must // be acquired before interacting with the endpoint fields. - mu sync.Mutex `state:"nosave"` + // + // During handshake, mu is locked by the protocol listen goroutine and + // released by the handshake completion goroutine. + mu sync.CrossGoroutineMutex `state:"nosave"` ownedByUser uint32 // state must be read/set using the EndpointState()/setEndpointState() @@ -436,9 +441,8 @@ type endpoint struct { isPortReserved bool `state:"manual"` isRegistered bool `state:"manual"` boundNICID tcpip.NICID - route stack.Route `state:"manual"` + route *stack.Route `state:"manual"` ttl uint8 - v6only bool isConnectNotified bool // h stores a reference to the current handshake state if the endpoint is in @@ -506,24 +510,9 @@ type endpoint struct { // delay is a boolean (0 is false) and must be accessed atomically. delay uint32 - // cork holds back segments until full. - // - // cork is a boolean (0 is false) and must be accessed atomically. - cork uint32 - // scoreboard holds TCP SACK Scoreboard information for this endpoint. scoreboard *SACKScoreboard - // The options below aren't implemented, but we remember the user - // settings because applications expect to be able to set/query these - // options. - - // slowAck holds the negated state of quick ack. It is stubbed out and - // does nothing. - // - // slowAck is a boolean (0 is false) and must be accessed atomically. - slowAck uint32 - // segmentQueue is used to hand received segments to the protocol // goroutine. Segments are queued as long as the queue is not full, // and dropped when it is. @@ -701,7 +690,7 @@ func (e *endpoint) UniqueID() uint64 { // // If userMSS is non-zero and is not greater than the maximum possible MSS for // r, it will be used; otherwise, the maximum possible MSS will be used. -func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 { +func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { // The maximum possible MSS is dependent on the route. // TODO(b/143359391): Respect TCP Min and Max size. maxMSS := uint16(r.MTU() - header.TCPMinimumSize) @@ -850,7 +839,6 @@ func (e *endpoint) recentTimestamp() uint32 { // +stateify savable type keepalive struct { sync.Mutex `state:"nosave"` - enabled bool idle time.Duration interval time.Duration count int @@ -884,6 +872,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue windowClamp: DefaultReceiveBufferSize, maxSynRetries: DefaultSynRetries, } + e.ops.InitHandler(e) + e.ops.SetMulticastLoop(true) + e.ops.SetQuickAck(true) var ss tcpip.TCPSendBufferSizeRangeOption if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { @@ -907,7 +898,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue var de tcpip.TCPDelayEnabled if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { - e.SetSockOptBool(tcpip.DelayOption, true) + e.ops.SetDelayOption(true) } var tcpLT tcpip.TCPLingerTimeoutOption @@ -1169,7 +1160,11 @@ func (e *endpoint) cleanupLocked() { e.boundPortFlags = ports.Flags{} e.boundDest = tcpip.FullAddress{} - e.route.Release() + if e.route != nil { + e.route.Release() + e.route = nil + } + e.stack.CompleteTransportEndpointCleanup(e) tcpip.DeleteDanglingEndpoint(e) } @@ -1279,11 +1274,20 @@ func (e *endpoint) ModerateRecvBuf(copied int) { e.rcvListMu.Unlock() } +// SetOwner implements tcpip.Endpoint.SetOwner. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } -func (e *endpoint) LastError() *tcpip.Error { +// Preconditions: e.mu must be held to call this function. +func (e *endpoint) hardErrorLocked() *tcpip.Error { + err := e.hardError + e.hardError = nil + return err +} + +// Preconditions: e.mu must be held to call this function. +func (e *endpoint) lastErrorLocked() *tcpip.Error { e.lastErrorMu.Lock() defer e.lastErrorMu.Unlock() err := e.lastError @@ -1291,6 +1295,16 @@ func (e *endpoint) LastError() *tcpip.Error { return err } +// LastError implements tcpip.Endpoint.LastError. +func (e *endpoint) LastError() *tcpip.Error { + e.LockUser() + defer e.UnlockUser() + if err := e.hardErrorLocked(); err != nil { + return err + } + return e.lastErrorLocked() +} + // Read reads data from the endpoint. func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.LockUser() @@ -1312,9 +1326,11 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, bufUsed := e.rcvBufUsed if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 { e.rcvListMu.Unlock() - he := e.HardError if s == StateError { - return buffer.View{}, tcpip.ControlMessages{}, he + if err := e.hardErrorLocked(); err != nil { + return buffer.View{}, tcpip.ControlMessages{}, err + } + return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive } e.stats.ReadErrors.NotConnected.Increment() return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected @@ -1370,9 +1386,13 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { // indicating the reason why it's not writable. // Caller must hold e.mu and e.sndBufMu func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) { + // The endpoint cannot be written to if it's not connected. switch s := e.EndpointState(); { case s == StateError: - return 0, e.HardError + if err := e.hardErrorLocked(); err != nil { + return 0, err + } + return 0, tcpip.ErrClosedForSend case !s.connecting() && !s.connected(): return 0, tcpip.ErrClosedForSend case s.connecting(): @@ -1486,7 +1506,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro // but has some pending unread data. if s := e.EndpointState(); !s.connected() && s != StateClose { if s == StateError { - return 0, tcpip.ControlMessages{}, e.HardError + return 0, tcpip.ControlMessages{}, e.hardErrorLocked() } e.stats.ReadErrors.InvalidEndpointState.Increment() return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState @@ -1602,72 +1622,39 @@ func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed boo return false, false } -// SetSockOptBool sets a socket option. -func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - switch opt { - - case tcpip.CorkOption: - e.LockUser() - if !v { - atomic.StoreUint32(&e.cork, 0) - - // Handle the corked data. - e.sndWaker.Assert() - } else { - atomic.StoreUint32(&e.cork, 1) - } - e.UnlockUser() - - case tcpip.DelayOption: - if v { - atomic.StoreUint32(&e.delay, 1) - } else { - atomic.StoreUint32(&e.delay, 0) - - // Handle delayed data. - e.sndWaker.Assert() - } - - case tcpip.KeepaliveEnabledOption: - e.keepalive.Lock() - e.keepalive.enabled = v - e.keepalive.Unlock() - e.notifyProtocolGoroutine(notifyKeepaliveChanged) - - case tcpip.QuickAckOption: - o := uint32(1) - if v { - o = 0 - } - atomic.StoreUint32(&e.slowAck, o) - - case tcpip.ReuseAddressOption: - e.LockUser() - e.portFlags.TupleOnly = v - e.UnlockUser() - - case tcpip.ReusePortOption: - e.LockUser() - e.portFlags.LoadBalanced = v - e.UnlockUser() +// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet. +func (e *endpoint) OnReuseAddressSet(v bool) { + e.LockUser() + e.portFlags.TupleOnly = v + e.UnlockUser() +} - case tcpip.V6OnlyOption: - // We only recognize this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return tcpip.ErrInvalidEndpointState - } +// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet. +func (e *endpoint) OnReusePortSet(v bool) { + e.LockUser() + e.portFlags.LoadBalanced = v + e.UnlockUser() +} - // We only allow this to be set when we're in the initial state. - if e.EndpointState() != StateInitial { - return tcpip.ErrInvalidEndpointState - } +// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet. +func (e *endpoint) OnKeepAliveSet(v bool) { + e.notifyProtocolGoroutine(notifyKeepaliveChanged) +} - e.LockUser() - e.v6only = v - e.UnlockUser() +// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet. +func (e *endpoint) OnDelayOptionSet(v bool) { + if !v { + // Handle delayed data. + e.sndWaker.Assert() } +} - return nil +// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet. +func (e *endpoint) OnCorkOptionSet(v bool) { + if !v { + // Handle the corked data. + e.sndWaker.Assert() + } } // SetSockOptInt sets a socket option. @@ -1949,67 +1936,6 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) { return e.rcvBufUsed, nil } -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - - case tcpip.CorkOption: - return atomic.LoadUint32(&e.cork) != 0, nil - - case tcpip.DelayOption: - return atomic.LoadUint32(&e.delay) != 0, nil - - case tcpip.KeepaliveEnabledOption: - e.keepalive.Lock() - v := e.keepalive.enabled - e.keepalive.Unlock() - - return v, nil - - case tcpip.QuickAckOption: - v := atomic.LoadUint32(&e.slowAck) == 0 - return v, nil - - case tcpip.ReuseAddressOption: - e.LockUser() - v := e.portFlags.TupleOnly - e.UnlockUser() - - return v, nil - - case tcpip.ReusePortOption: - e.LockUser() - v := e.portFlags.LoadBalanced - e.UnlockUser() - - return v, nil - - case tcpip.V6OnlyOption: - // We only recognize this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return false, tcpip.ErrUnknownProtocolOption - } - - e.LockUser() - v := e.v6only - e.UnlockUser() - - return v, nil - - case tcpip.MulticastLoopOption: - return true, nil - - case tcpip.AcceptConnOption: - e.LockUser() - defer e.UnlockUser() - - return e.EndpointState() == StateListen, nil - - default: - return false, tcpip.ErrUnknownProtocolOption - } -} - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { @@ -2166,7 +2092,7 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { // checkV4MappedLocked determines the effective network protocol and converts // addr to its canonical form. func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) { - unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only) + unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) if err != nil { return tcpip.FullAddress{}, 0, err } @@ -2243,7 +2169,10 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc return tcpip.ErrAlreadyConnecting case StateError: - return e.HardError + if err := e.hardErrorLocked(); err != nil { + return err + } + return tcpip.ErrConnectionAborted default: return tcpip.ErrInvalidEndpointState @@ -2417,7 +2346,7 @@ func (e *endpoint) startMainLoop(handshake bool) *tcpip.Error { e.lastErrorMu.Unlock() e.setEndpointState(StateError) - e.HardError = err + e.hardError = err // Call cleanupLocked to free up any reservations. e.cleanupLocked() @@ -2697,7 +2626,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) { // v6only set to false. if netProto == header.IPv6ProtocolNumber { stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber) - alsoBindToV4 := !e.v6only && addr.Addr == "" && stackHasV4 + alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4 if alsoBindToV4 { netProtos = append(netProtos, header.IPv4ProtocolNumber) } @@ -2782,7 +2711,7 @@ func (e *endpoint) getRemoteAddress() tcpip.FullAddress { func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) { // TCP HandlePacket is not required anymore as inbound packets first - // land at the Dispatcher which then can either delivery using the + // land at the Dispatcher which then can either deliver using the // worker go routine or directly do the invoke the tcp processing inline // based on the state of the endpoint. } @@ -3161,7 +3090,7 @@ func (e *endpoint) State() uint32 { func (e *endpoint) Info() tcpip.EndpointInfo { e.LockUser() // Make a copy of the endpoint info. - ret := e.EndpointInfo + ret := e.TransportEndpointInfo e.UnlockUser() return &ret } @@ -3187,6 +3116,7 @@ func (e *endpoint) Wait() { } } +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index bb901c0f8..ba67176b5 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -321,21 +321,21 @@ func (e *endpoint) loadRecentTSTime(unix unixTime) { } // saveHardError is invoked by stateify. -func (e *EndpointInfo) saveHardError() string { - if e.HardError == nil { +func (e *endpoint) saveHardError() string { + if e.hardError == nil { return "" } - return e.HardError.String() + return e.hardError.String() } // loadHardError is invoked by stateify. -func (e *EndpointInfo) loadHardError(s string) { +func (e *endpoint) loadHardError(s string) { if s == "" { return } - e.HardError = tcpip.StringToError(s) + e.hardError = tcpip.StringToError(s) } // saveMeasureTime is invoked by stateify. diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 2329aca4b..672159eed 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -250,7 +250,7 @@ func replyWithReset(stack *stack.Stack, s *segment, tos, ttl uint8) *tcpip.Error ttl = route.DefaultTTL() } - return sendTCP(&route, tcpFields{ + return sendTCP(route, tcpFields{ id: s.id, ttl: ttl, tos: tos, diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 8e0b7c843..f2b1b68da 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -16,6 +16,7 @@ package tcp import ( "container/heap" + "math" "time" "gvisor.dev/gvisor/pkg/tcpip" @@ -48,6 +49,10 @@ type receiver struct { rcvWndScale uint8 + // prevBufused is the snapshot of endpoint rcvBufUsed taken when we + // advertise a receive window. + prevBufUsed int + closed bool // pendingRcvdSegments is bounded by the receive buffer size of the @@ -80,9 +85,9 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool { // outgoing packets, we should use what we have advertised for acceptability // test. scaledWindowSize := r.rcvWnd >> r.rcvWndScale - if scaledWindowSize > 0xffff { + if scaledWindowSize > math.MaxUint16 { // This is what we actually put in the Window field. - scaledWindowSize = 0xffff + scaledWindowSize = math.MaxUint16 } advertisedWindowSize := scaledWindowSize << r.rcvWndScale return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize)) @@ -106,6 +111,34 @@ func (r *receiver) currentWindow() (curWnd seqnum.Size) { func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { newWnd := r.ep.selectWindow() curWnd := r.currentWindow() + unackLen := int(r.ep.snd.maxSentAck.Size(r.rcvNxt)) + bufUsed := r.ep.receiveBufferUsed() + + // Grow the right edge of the window only for payloads larger than the + // the segment overhead OR if the application is actively consuming data. + // + // Avoiding growing the right edge otherwise, addresses a situation below: + // An application has been slow in reading data and we have burst of + // incoming segments lengths < segment overhead. Here, our available free + // memory would reduce drastically when compared to the advertised receive + // window. + // + // For example: With incoming 512 bytes segments, segment overhead of + // 552 bytes (at the time of writing this comment), with receive window + // starting from 1MB and with rcvAdvWndScale being 1, buffer would reach 0 + // when the curWnd is still 19436 bytes, because for every incoming segment + // newWnd would reduce by (552+512) >> rcvAdvWndScale (current value 1), + // while curWnd would reduce by 512 bytes. + // Such a situation causes us to keep tail dropping the incoming segments + // and never advertise zero receive window to the peer. + // + // Linux does a similar check for minimal sk_buff size (128): + // https://github.com/torvalds/linux/blob/d5beb3140f91b1c8a3d41b14d729aefa4dcc58bc/net/ipv4/tcp_input.c#L783 + // + // Also, if the application is reading the data, we keep growing the right + // edge, as we are still advertising a window that we think can be serviced. + toGrow := unackLen >= SegSize || bufUsed <= r.prevBufUsed + // Update rcvAcc only if new window is > previously advertised window. We // should never shrink the acceptable sequence space once it has been // advertised the peer. If we shrink the acceptable sequence space then we @@ -115,7 +148,7 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { // rcvWUP rcvNxt rcvAcc new rcvAcc // <=====curWnd ===> // <========= newWnd > curWnd ========= > - if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) { + if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) && toGrow { // If the new window moves the right edge, then update rcvAcc. r.rcvAcc = r.rcvNxt.Add(seqnum.Size(newWnd)) } else { @@ -130,11 +163,24 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { // receiver's estimated RTT. r.rcvWnd = newWnd r.rcvWUP = r.rcvNxt + r.prevBufUsed = bufUsed scaledWnd := r.rcvWnd >> r.rcvWndScale if scaledWnd == 0 { // Increment a metric if we are advertising an actual zero window. r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment() } + + // If we started off with a window larger than what can he held in + // the 16bit window field, we ceil the value to the max value. + // While ceiling, we still do not want to grow the right edge when + // not applicable. + if scaledWnd > math.MaxUint16 { + if toGrow { + scaledWnd = seqnum.Size(math.MaxUint16) + } else { + scaledWnd = seqnum.Size(uint16(scaledWnd)) + } + } return r.rcvNxt, scaledWnd } diff --git a/pkg/tcpip/transport/tcp/reno_recovery.go b/pkg/tcpip/transport/tcp/reno_recovery.go new file mode 100644 index 000000000..2aa708e97 --- /dev/null +++ b/pkg/tcpip/transport/tcp/reno_recovery.go @@ -0,0 +1,67 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +// renoRecovery stores the variables related to TCP Reno loss recovery +// algorithm. +// +// +stateify savable +type renoRecovery struct { + s *sender +} + +func newRenoRecovery(s *sender) *renoRecovery { + return &renoRecovery{s: s} +} + +func (rr *renoRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) { + ack := rcvdSeg.ackNumber + snd := rr.s + + // We are in fast recovery mode. Ignore the ack if it's out of range. + if !ack.InRange(snd.sndUna, snd.sndNxt+1) { + return + } + + // Don't count this as a duplicate if it is carrying data or + // updating the window. + if rcvdSeg.logicalLen() != 0 || snd.sndWnd != rcvdSeg.window { + return + } + + // Inflate the congestion window if we're getting duplicate acks + // for the packet we retransmitted. + if !fastRetransmit && ack == snd.fr.first { + // We received a dup, inflate the congestion window by 1 packet + // if we're not at the max yet. Only inflate the window if + // regular FastRecovery is in use, RFC6675 does not require + // inflating cwnd on duplicate ACKs. + if snd.sndCwnd < snd.fr.maxCwnd { + snd.sndCwnd++ + } + return + } + + // A partial ack was received. Retransmit this packet and remember it + // so that we don't retransmit it again. + // + // We don't inflate the window because we're putting the same packet + // back onto the wire. + // + // N.B. The retransmit timer will be reset by the caller. + snd.fr.first = ack + snd.dupAckCount = 0 + snd.resendSegment() +} diff --git a/pkg/tcpip/transport/tcp/sack_recovery.go b/pkg/tcpip/transport/tcp/sack_recovery.go new file mode 100644 index 000000000..7e813fa96 --- /dev/null +++ b/pkg/tcpip/transport/tcp/sack_recovery.go @@ -0,0 +1,120 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import "gvisor.dev/gvisor/pkg/tcpip/seqnum" + +// sackRecovery stores the variables related to TCP SACK loss recovery +// algorithm. +// +// +stateify savable +type sackRecovery struct { + s *sender +} + +func newSACKRecovery(s *sender) *sackRecovery { + return &sackRecovery{s: s} +} + +// handleSACKRecovery implements the loss recovery phase as described in RFC6675 +// section 5, step C. +func (sr *sackRecovery) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) { + snd := sr.s + snd.SetPipe() + + if smss := int(snd.ep.scoreboard.SMSS()); limit > smss { + // Cap segment size limit to s.smss as SACK recovery requires + // that all retransmissions or new segments send during recovery + // be of <= SMSS. + limit = smss + } + + nextSegHint := snd.writeList.Front() + for snd.outstanding < snd.sndCwnd { + var nextSeg *segment + var rescueRtx bool + nextSeg, nextSegHint, rescueRtx = snd.NextSeg(nextSegHint) + if nextSeg == nil { + return dataSent + } + if !snd.isAssignedSequenceNumber(nextSeg) || snd.sndNxt.LessThanEq(nextSeg.sequenceNumber) { + // New data being sent. + + // Step C.3 described below is handled by + // maybeSendSegment which increments sndNxt when + // a segment is transmitted. + // + // Step C.3 "If any of the data octets sent in + // (C.1) are above HighData, HighData must be + // updated to reflect the transmission of + // previously unsent data." + // + // We pass s.smss as the limit as the Step 2) requires that + // new data sent should be of size s.smss or less. + if sent := snd.maybeSendSegment(nextSeg, limit, end); !sent { + return dataSent + } + dataSent = true + snd.outstanding++ + snd.writeNext = nextSeg.Next() + continue + } + + // Now handle the retransmission case where we matched either step 1,3 or 4 + // of the NextSeg algorithm. + // RFC 6675, Step C.4. + // + // "The estimate of the amount of data outstanding in the network + // must be updated by incrementing pipe by the number of octets + // transmitted in (C.1)." + snd.outstanding++ + dataSent = true + snd.sendSegment(nextSeg) + + segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) + if rescueRtx { + // We do the last part of rule (4) of NextSeg here to update + // RescueRxt as until this point we don't know if we are going + // to use the rescue transmission. + snd.fr.rescueRxt = snd.fr.last + } else { + // RFC 6675, Step C.2 + // + // "If any of the data octets sent in (C.1) are below + // HighData, HighRxt MUST be set to the highest sequence + // number of the retransmitted segment unless NextSeg () + // rule (4) was invoked for this retransmission." + snd.fr.highRxt = segEnd - 1 + } + } + return dataSent +} + +func (sr *sackRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) { + snd := sr.s + if fastRetransmit { + snd.resendSegment() + } + + // We are in fast recovery mode. Ignore the ack if it's out of range. + if ack := rcvdSeg.ackNumber; !ack.InRange(snd.sndUna, snd.sndNxt+1) { + return + } + + // RFC 6675 recovery algorithm step C 1-5. + end := snd.sndUna.Add(snd.sndWnd) + dataSent := sr.handleSACKRecovery(snd.maxPayloadSize, end) + snd.postXmit(dataSent) +} diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 2091989cc..5ef73ec74 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -204,7 +204,7 @@ func (s *segment) payloadSize() int { // segMemSize is the amount of memory used to hold the segment data and // the associated metadata. func (s *segment) segMemSize() int { - return segSize + s.data.Size() + return SegSize + s.data.Size() } // parse populates the sequence & ack numbers, flags, and window fields of the diff --git a/pkg/tcpip/transport/tcp/segment_unsafe.go b/pkg/tcpip/transport/tcp/segment_unsafe.go index 0ab7b8f56..392ff0859 100644 --- a/pkg/tcpip/transport/tcp/segment_unsafe.go +++ b/pkg/tcpip/transport/tcp/segment_unsafe.go @@ -19,5 +19,6 @@ import ( ) const ( - segSize = int(unsafe.Sizeof(segment{})) + // SegSize is the minimal size of the segment overhead. + SegSize = int(unsafe.Sizeof(segment{})) ) diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 0e0fdf14c..baec762e1 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -18,7 +18,6 @@ import ( "fmt" "math" "sort" - "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sleep" @@ -92,6 +91,17 @@ type congestionControl interface { PostRecovery() } +// lossRecovery is an interface that must be implemented by any supported +// loss recovery algorithm. +type lossRecovery interface { + // DoRecovery is invoked when loss is detected and segments need + // to be retransmitted. The cumulative or selective ACK is passed along + // with the flag which identifies whether the connection entered fast + // retransmit with this ACK and to retransmit the first unacknowledged + // segment. + DoRecovery(rcvdSeg *segment, fastRetransmit bool) +} + // sender holds the state necessary to send TCP segments. // // +stateify savable @@ -108,6 +118,9 @@ type sender struct { // fr holds state related to fast recovery. fr fastRecovery + // lr is the loss recovery algorithm used by the sender. + lr lossRecovery + // sndCwnd is the congestion window, in packets. sndCwnd int @@ -276,6 +289,8 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint s.cc = s.initCongestionControl(ep.cc) + s.lr = s.initLossRecovery() + // A negative sndWndScale means that no scaling is in use, otherwise we // store the scaling value. if sndWndScale > 0 { @@ -330,6 +345,14 @@ func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionCon } } +// initLossRecovery initiates the loss recovery algorithm for the sender. +func (s *sender) initLossRecovery() lossRecovery { + if s.ep.sackPermitted { + return newSACKRecovery(s) + } + return newRenoRecovery(s) +} + // updateMaxPayloadSize updates the maximum payload size based on the given // MTU. If this is in response to "packet too big" control packets (indicated // by the count argument), it also reduces the number of outstanding packets and @@ -550,7 +573,7 @@ func (s *sender) retransmitTimerExpired() bool { // We were attempting fast recovery but were not successful. // Leave the state. We don't need to update ssthresh because it // has already been updated when entered fast-recovery. - s.leaveFastRecovery() + s.leaveRecovery() } s.state = RTORecovery @@ -789,7 +812,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se } if !nextTooBig && seg.data.Size() < available { // Segment is not full. - if s.outstanding > 0 && atomic.LoadUint32(&s.ep.delay) != 0 { + if s.outstanding > 0 && s.ep.ops.GetDelayOption() { // Nagle's algorithm. From Wikipedia: // Nagle's algorithm works by // combining a number of small @@ -808,7 +831,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se // send space and MSS. // TODO(gvisor.dev/issue/2833): Drain the held segments after a // timeout. - if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 { + if seg.data.Size() < s.maxPayloadSize && s.ep.ops.GetCorkOption() { return false } } @@ -913,79 +936,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se return true } -// handleSACKRecovery implements the loss recovery phase as described in RFC6675 -// section 5, step C. -func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) { - s.SetPipe() - - if smss := int(s.ep.scoreboard.SMSS()); limit > smss { - // Cap segment size limit to s.smss as SACK recovery requires - // that all retransmissions or new segments send during recovery - // be of <= SMSS. - limit = smss - } - - nextSegHint := s.writeList.Front() - for s.outstanding < s.sndCwnd { - var nextSeg *segment - var rescueRtx bool - nextSeg, nextSegHint, rescueRtx = s.NextSeg(nextSegHint) - if nextSeg == nil { - return dataSent - } - if !s.isAssignedSequenceNumber(nextSeg) || s.sndNxt.LessThanEq(nextSeg.sequenceNumber) { - // New data being sent. - - // Step C.3 described below is handled by - // maybeSendSegment which increments sndNxt when - // a segment is transmitted. - // - // Step C.3 "If any of the data octets sent in - // (C.1) are above HighData, HighData must be - // updated to reflect the transmission of - // previously unsent data." - // - // We pass s.smss as the limit as the Step 2) requires that - // new data sent should be of size s.smss or less. - if sent := s.maybeSendSegment(nextSeg, limit, end); !sent { - return dataSent - } - dataSent = true - s.outstanding++ - s.writeNext = nextSeg.Next() - continue - } - - // Now handle the retransmission case where we matched either step 1,3 or 4 - // of the NextSeg algorithm. - // RFC 6675, Step C.4. - // - // "The estimate of the amount of data outstanding in the network - // must be updated by incrementing pipe by the number of octets - // transmitted in (C.1)." - s.outstanding++ - dataSent = true - s.sendSegment(nextSeg) - - segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) - if rescueRtx { - // We do the last part of rule (4) of NextSeg here to update - // RescueRxt as until this point we don't know if we are going - // to use the rescue transmission. - s.fr.rescueRxt = s.fr.last - } else { - // RFC 6675, Step C.2 - // - // "If any of the data octets sent in (C.1) are below - // HighData, HighRxt MUST be set to the highest sequence - // number of the retransmitted segment unless NextSeg () - // rule (4) was invoked for this retransmission." - s.fr.highRxt = segEnd - 1 - } - } - return dataSent -} - func (s *sender) sendZeroWindowProbe() { ack, win := s.ep.rcv.getSendParams() s.unackZeroWindowProbes++ @@ -1014,6 +964,30 @@ func (s *sender) disableZeroWindowProbing() { s.resendTimer.disable() } +func (s *sender) postXmit(dataSent bool) { + if dataSent { + // We sent data, so we should stop the keepalive timer to ensure + // that no keepalives are sent while there is pending data. + s.ep.disableKeepaliveTimer() + } + + // If the sender has advertized zero receive window and we have + // data to be sent out, start zero window probing to query the + // the remote for it's receive window size. + if s.writeNext != nil && s.sndWnd == 0 { + s.enableZeroWindowProbing() + } + + // Enable the timer if we have pending data and it's not enabled yet. + if !s.resendTimer.enabled() && s.sndUna != s.sndNxt { + s.resendTimer.enable(s.rto) + } + // If we have no more pending data, start the keepalive timer. + if s.sndUna == s.sndNxt { + s.ep.resetKeepaliveTimer(false) + } +} + // sendData sends new data segments. It is called when data becomes available or // when the send window opens up. func (s *sender) sendData() { @@ -1034,55 +1008,29 @@ func (s *sender) sendData() { } var dataSent bool - - // RFC 6675 recovery algorithm step C 1-5. - if s.fr.active && s.ep.sackPermitted { - dataSent = s.handleSACKRecovery(s.maxPayloadSize, end) - } else { - for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { - cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize - if cwndLimit < limit { - limit = cwndLimit - } - if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { - // Move writeNext along so that we don't try and scan data that - // has already been SACKED. - s.writeNext = seg.Next() - continue - } - if sent := s.maybeSendSegment(seg, limit, end); !sent { - break - } - dataSent = true - s.outstanding += s.pCount(seg) + for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { + cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize + if cwndLimit < limit { + limit = cwndLimit + } + if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { + // Move writeNext along so that we don't try and scan data that + // has already been SACKED. s.writeNext = seg.Next() + continue } + if sent := s.maybeSendSegment(seg, limit, end); !sent { + break + } + dataSent = true + s.outstanding += s.pCount(seg) + s.writeNext = seg.Next() } - if dataSent { - // We sent data, so we should stop the keepalive timer to ensure - // that no keepalives are sent while there is pending data. - s.ep.disableKeepaliveTimer() - } - - // If the sender has advertized zero receive window and we have - // data to be sent out, start zero window probing to query the - // the remote for it's receive window size. - if s.writeNext != nil && s.sndWnd == 0 { - s.enableZeroWindowProbing() - } - - // Enable the timer if we have pending data and it's not enabled yet. - if !s.resendTimer.enabled() && s.sndUna != s.sndNxt { - s.resendTimer.enable(s.rto) - } - // If we have no more pending data, start the keepalive timer. - if s.sndUna == s.sndNxt { - s.ep.resetKeepaliveTimer(false) - } + s.postXmit(dataSent) } -func (s *sender) enterFastRecovery() { +func (s *sender) enterRecovery() { s.fr.active = true // Save state to reflect we're now in fast recovery. // @@ -1104,7 +1052,7 @@ func (s *sender) enterFastRecovery() { s.ep.stack.Stats().TCP.FastRecovery.Increment() } -func (s *sender) leaveFastRecovery() { +func (s *sender) leaveRecovery() { s.fr.active = false s.fr.maxCwnd = 0 s.dupAckCount = 0 @@ -1115,57 +1063,6 @@ func (s *sender) leaveFastRecovery() { s.cc.PostRecovery() } -func (s *sender) handleFastRecovery(seg *segment) (rtx bool) { - ack := seg.ackNumber - // We are in fast recovery mode. Ignore the ack if it's out of - // range. - if !ack.InRange(s.sndUna, s.sndNxt+1) { - return false - } - - // Leave fast recovery if it acknowledges all the data covered by - // this fast recovery session. - if s.fr.last.LessThan(ack) { - s.leaveFastRecovery() - return false - } - - if s.ep.sackPermitted { - // When SACK is enabled we let retransmission be governed by - // the SACK logic. - return false - } - - // Don't count this as a duplicate if it is carrying data or - // updating the window. - if seg.logicalLen() != 0 || s.sndWnd != seg.window { - return false - } - - // Inflate the congestion window if we're getting duplicate acks - // for the packet we retransmitted. - if ack == s.fr.first { - // We received a dup, inflate the congestion window by 1 packet - // if we're not at the max yet. Only inflate the window if - // regular FastRecovery is in use, RFC6675 does not require - // inflating cwnd on duplicate ACKs. - if s.sndCwnd < s.fr.maxCwnd { - s.sndCwnd++ - } - return false - } - - // A partial ack was received. Retransmit this packet and - // remember it so that we don't retransmit it again. We don't - // inflate the window because we're putting the same packet back - // onto the wire. - // - // N.B. The retransmit timer will be reset by the caller. - s.fr.first = ack - s.dupAckCount = 0 - return true -} - // isAssignedSequenceNumber relies on the fact that we only set flags once a // sequencenumber is assigned and that is only done right before we send the // segment. As a result any segment that has a non-zero flag has a valid @@ -1228,14 +1125,11 @@ func (s *sender) SetPipe() { s.outstanding = pipe } -// checkDuplicateAck is called when an ack is received. It manages the state -// related to duplicate acks and determines if a retransmit is needed according -// to the rules in RFC 6582 (NewReno). -func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { +// detectLoss is called when an ack is received and returns whether a loss is +// detected. It manages the state related to duplicate acks and determines if +// a retransmit is needed according to the rules in RFC 6582 (NewReno). +func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) { ack := seg.ackNumber - if s.fr.active { - return s.handleFastRecovery(seg) - } // We're not in fast recovery yet. A segment is considered a duplicate // only if it doesn't carry any data and doesn't update the send window, @@ -1266,14 +1160,14 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 // // We only do the check here, the incrementing of last to the highest - // sequence number transmitted till now is done when enterFastRecovery + // sequence number transmitted till now is done when enterRecovery // is invoked. if !s.fr.last.LessThan(seg.ackNumber) { s.dupAckCount = 0 return false } s.cc.HandleNDupAcks() - s.enterFastRecovery() + s.enterRecovery() s.dupAckCount = 0 return true } @@ -1415,14 +1309,23 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { s.SetPipe() } - // Count the duplicates and do the fast retransmit if needed. - rtx := s.checkDuplicateAck(rcvdSeg) + ack := rcvdSeg.ackNumber + fastRetransmit := false + // Do not leave fast recovery, if the ACK is out of range. + if s.fr.active { + // Leave fast recovery if it acknowledges all the data covered by + // this fast recovery session. + if ack.InRange(s.sndUna, s.sndNxt+1) && s.fr.last.LessThan(ack) { + s.leaveRecovery() + } + } else { + // Detect loss by counting the duplicates and enter recovery. + fastRetransmit = s.detectLoss(rcvdSeg) + } // Stash away the current window size. s.sndWnd = rcvdSeg.window - ack := rcvdSeg.ackNumber - // Disable zero window probing if remote advertizes a non-zero receive // window. This can be with an ACK to the zero window probe (where the // acknumber refers to the already acknowledged byte) OR to any previously @@ -1539,19 +1442,24 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { s.resendTimer.disable() } } + // Now that we've popped all acknowledged data from the retransmit // queue, retransmit if needed. - if rtx { - s.resendSegment() + if s.fr.active { + s.lr.DoRecovery(rcvdSeg, fastRetransmit) + // When SACK is enabled data sending is governed by steps in + // RFC 6675 Section 5 recovery steps A-C. + // See: https://tools.ietf.org/html/rfc6675#section-5. + if s.ep.sackPermitted { + return + } } // Send more data now that some of the pending data has been ack'd, or // that the window opened up, or the congestion window was inflated due // to a duplicate ack during fast recovery. This will also re-enable // the retransmit timer if needed. - if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || rcvdSeg.hasNewSACKInfo { - s.sendData() - } + s.sendData() } // sendSegment sends the specified segment. diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 9f0fb41e3..1759ebea9 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -75,9 +75,6 @@ func TestGiveUpConnect(t *testing.T) { // Wait for ep to become writable. <-notifyCh - if err := ep.LastError(); err != tcpip.ErrAborted { - t.Fatalf("got ep.LastError() = %s, want = %s", err, tcpip.ErrAborted) - } // Call Connect again to retreive the handshake failure status // and stats updates. @@ -267,7 +264,7 @@ func TestTCPResetsSentNoICMP(t *testing.T) { } // Read outgoing ICMP stats and check no ICMP DstUnreachable was recorded. - sent := stats.ICMP.V4PacketsSent + sent := stats.ICMP.V4.PacketsSent if got, want := sent.DstUnreachable.Value(), uint64(0); got != want { t.Errorf("got ICMP DstUnreachable.Value() = %d, want = %d", got, want) } @@ -2532,10 +2529,10 @@ func TestSegmentMerging(t *testing.T) { { "cork", func(ep tcpip.Endpoint) { - ep.SetSockOptBool(tcpip.CorkOption, true) + ep.SocketOptions().SetCorkOption(true) }, func(ep tcpip.Endpoint) { - ep.SetSockOptBool(tcpip.CorkOption, false) + ep.SocketOptions().SetCorkOption(false) }, }, } @@ -2627,7 +2624,7 @@ func TestDelay(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) - c.EP.SetSockOptBool(tcpip.DelayOption, true) + c.EP.SocketOptions().SetDelayOption(true) var allData []byte for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} { @@ -2675,7 +2672,7 @@ func TestUndelay(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) - c.EP.SetSockOptBool(tcpip.DelayOption, true) + c.EP.SocketOptions().SetDelayOption(true) allData := [][]byte{{0}, {1, 2, 3}} for i, data := range allData { @@ -2708,7 +2705,7 @@ func TestUndelay(t *testing.T) { // Check that we don't get the second packet yet. c.CheckNoPacketTimeout("delayed second packet transmitted", 100*time.Millisecond) - c.EP.SetSockOptBool(tcpip.DelayOption, false) + c.EP.SocketOptions().SetDelayOption(false) // Check that data is received. second := c.GetPacket() @@ -2745,8 +2742,8 @@ func TestMSSNotDelayed(t *testing.T) { fn func(tcpip.Endpoint) }{ {"no-op", func(tcpip.Endpoint) {}}, - {"delay", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.DelayOption, true) }}, - {"cork", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.CorkOption, true) }}, + {"delay", func(ep tcpip.Endpoint) { ep.SocketOptions().SetDelayOption(true) }}, + {"cork", func(ep tcpip.Endpoint) { ep.SocketOptions().SetCorkOption(true) }}, } for _, test := range tests { @@ -3198,6 +3195,11 @@ loop: case tcpip.ErrWouldBlock: select { case <-ch: + // Expect the state to be StateError and subsequent Reads to fail with HardError. + if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset { + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) + } + break loop case <-time.After(1 * time.Second): t.Fatalf("Timed out waiting for reset to arrive") } @@ -3207,14 +3209,10 @@ loop: t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) } } - // Expect the state to be StateError and subsequent Reads to fail with HardError. - if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset { - t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) - } + if tcp.EndpointState(c.EP.State()) != tcp.StateError { t.Fatalf("got EP state is not StateError") } - if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 { t.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got) } @@ -4193,9 +4191,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4205,9 +4201,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4218,9 +4212,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4233,9 +4225,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4246,9 +4236,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4261,9 +4249,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4656,13 +4642,9 @@ func TestConnectAvoidsBoundPorts(t *testing.T) { switch network { case "ipv4": case "ipv6": - if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil { - t.Fatalf("SetSockOptBool(V6OnlyOption(true)) failed: %s", err) - } + ep.SocketOptions().SetV6Only(true) case "dual": - if err := ep.SetSockOptBool(tcpip.V6OnlyOption, false); err != nil { - t.Fatalf("SetSockOptBool(V6OnlyOption(false)) failed: %s", err) - } + ep.SocketOptions().SetV6Only(false) default: t.Fatalf("unknown network: '%s'", network) } @@ -4998,9 +4980,7 @@ func TestKeepalive(t *testing.T) { if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5); err != nil { t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5): %s", err) } - if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil { - t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err) - } + c.EP.SocketOptions().SetKeepAlive(true) // 5 unacked keepalives are sent. ACK each one, and check that the // connection stays alive after 5. @@ -6118,10 +6098,13 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { // Introduce a 25ms latency by delaying the first byte. latency := 25 * time.Millisecond time.Sleep(latency) - rawEP.SendPacketWithTS([]byte{1}, tsVal) + // Send an initial payload with atleast segment overhead size. The receive + // window would not grow for smaller segments. + rawEP.SendPacketWithTS(make([]byte, tcp.SegSize), tsVal) pkt := rawEP.VerifyAndReturnACKWithTS(tsVal) rcvWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() + time.Sleep(25 * time.Millisecond) // Allocate a large enough payload for the test. @@ -6394,10 +6377,7 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcpip.T if err != nil { t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %s", err) } - gotDelayOption, err := ep.GetSockOptBool(tcpip.DelayOption) - if err != nil { - t.Fatalf("ep.GetSockOptBool(tcpip.DelayOption) failed: %s", err) - } + gotDelayOption := ep.SocketOptions().GetDelayOption() if gotDelayOption != wantDelayOption { t.Errorf("ep.GetSockOptBool(tcpip.DelayOption) got: %t, want: %t", gotDelayOption, wantDelayOption) } @@ -7250,9 +7230,7 @@ func TestKeepaliveWithUserTimeout(t *testing.T) { if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10); err != nil { t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10): %s", err) } - if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil { - t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err) - } + c.EP.SocketOptions().SetKeepAlive(true) // Set userTimeout to be the duration to be 1 keepalive // probes. Which means that after the first probe is sent diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index e6aa4fc4b..010a23e45 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -592,9 +592,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) { c.t.Fatalf("NewEndpoint failed: %v", err) } - if err := c.EP.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil { - c.t.Fatalf("SetSockOpt failed failed: %v", err) - } + c.EP.SocketOptions().SetV6Only(v6only) } // GetV6Packet reads a single packet from the link layer endpoint of the context diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index c78549424..7ebae63d8 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -56,6 +56,7 @@ go_test( "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/icmp", "//pkg/waiter", ], ) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 57976d4e3..5043e7aa5 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -16,8 +16,8 @@ package udp import ( "fmt" + "sync/atomic" - "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -77,6 +77,7 @@ func (s EndpointState) String() string { // +stateify savable type endpoint struct { stack.TransportEndpointInfo + tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. @@ -94,21 +95,20 @@ type endpoint struct { rcvClosed bool // The following fields are protected by the mu mutex. - mu sync.RWMutex `state:"nosave"` - sndBufSize int - sndBufSizeMax int + mu sync.RWMutex `state:"nosave"` + sndBufSize int + sndBufSizeMax int + // state must be read/set using the EndpointState()/setEndpointState() + // methods. state EndpointState - route stack.Route `state:"manual"` + route *stack.Route `state:"manual"` dstPort uint16 - v6only bool ttl uint8 multicastTTL uint8 multicastAddr tcpip.Address multicastNICID tcpip.NICID - multicastLoop bool portFlags ports.Flags bindToDevice tcpip.NICID - noChecksum bool lastErrorMu sync.Mutex `state:"nosave"` lastError *tcpip.Error `state:".(string)"` @@ -122,17 +122,6 @@ type endpoint struct { // applied while sending packets. Defaults to 0 as on Linux. sendTOS uint8 - // receiveTOS determines if the incoming IPv4 TOS header field is passed - // as ancillary data to ControlMessages on Read. - receiveTOS bool - - // receiveTClass determines if the incoming IPv6 TClass header field is - // passed as ancillary data to ControlMessages on Read. - receiveTClass bool - - // receiveIPPacketInfo determines if the packet info is returned by Read. - receiveIPPacketInfo bool - // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags @@ -188,13 +177,14 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue // // Linux defaults to TTL=1. multicastTTL: 1, - multicastLoop: true, rcvBufSizeMax: 32 * 1024, sndBufSizeMax: 32 * 1024, multicastMemberships: make(map[multicastMembership]struct{}), state: StateInitial, uniqueID: s.UniqueID(), } + e.ops.InitHandler(e) + e.ops.SetMulticastLoop(true) // Override with stack defaults. var ss stack.SendBufferSizeOption @@ -210,6 +200,20 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue return e } +// setEndpointState updates the state of the endpoint to state atomically. This +// method is unexported as the only place we should update the state is in this +// package but we allow the state to be read freely without holding e.mu. +// +// Precondition: e.mu must be held to call this method. +func (e *endpoint) setEndpointState(state EndpointState) { + atomic.StoreUint32((*uint32)(&e.state), uint32(state)) +} + +// EndpointState() returns the current state of the endpoint. +func (e *endpoint) EndpointState() EndpointState { + return EndpointState(atomic.LoadUint32((*uint32)(&e.state))) +} + // UniqueID implements stack.TransportEndpoint.UniqueID. func (e *endpoint) UniqueID() uint64 { return e.uniqueID @@ -235,7 +239,7 @@ func (e *endpoint) Close() { e.mu.Lock() e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite - switch e.state { + switch e.EndpointState() { case StateBound, StateConnected: e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{}) @@ -258,10 +262,13 @@ func (e *endpoint) Close() { } e.rcvMu.Unlock() - e.route.Release() + if e.route != nil { + e.route.Release() + e.route = nil + } // Update the state. - e.state = StateClosed + e.setEndpointState(StateClosed) e.mu.Unlock() @@ -303,21 +310,16 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess HasTimestamp: true, Timestamp: p.timestamp, } - e.mu.RLock() - receiveTOS := e.receiveTOS - receiveTClass := e.receiveTClass - receiveIPPacketInfo := e.receiveIPPacketInfo - e.mu.RUnlock() - if receiveTOS { + if e.ops.GetReceiveTOS() { cm.HasTOS = true cm.TOS = p.tos } - if receiveTClass { + if e.ops.GetReceiveTClass() { cm.HasTClass = true // Although TClass is an 8-bit value it's read in the CMsg as a uint32. cm.TClass = uint32(p.tos) } - if receiveIPPacketInfo { + if e.ops.GetReceivePacketInfo() { cm.HasIPPacketInfo = true cm.PacketInfo = p.packetInfo } @@ -330,7 +332,7 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess // // Returns true for retry if preparation should be retried. func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) { - switch e.state { + switch e.EndpointState() { case StateInitial: case StateConnected: return false, nil @@ -352,7 +354,7 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. - if e.state != StateInitial { + if e.EndpointState() != StateInitial { return true, nil } @@ -367,9 +369,9 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi // connectRoute establishes a route to the specified interface or the // configured multicast interface if no interface is specified and the // specified address is a multicast address. -func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (stack.Route, tcpip.NICID, *tcpip.Error) { +func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (*stack.Route, tcpip.NICID, *tcpip.Error) { localAddr := e.ID.LocalAddress - if isBroadcastOrMulticast(localAddr) { + if e.isBroadcastOrMulticast(nicID, netProto, localAddr) { // A packet can only originate from a unicast address (i.e., an interface). localAddr = "" } @@ -384,9 +386,9 @@ func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netPr } // Find a route to the desired destination. - r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.multicastLoop) + r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.ops.GetMulticastLoop()) if err != nil { - return stack.Route{}, 0, err + return nil, 0, err } return r, nicID, nil } @@ -429,7 +431,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c to := opts.To e.mu.RLock() - defer e.mu.RUnlock() + lockReleased := false + defer func() { + if lockReleased { + return + } + e.mu.RUnlock() + }() // If we've shutdown with SHUT_WR we are in an invalid state for sending. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { @@ -448,36 +456,9 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } } - var route *stack.Route - var resolve func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) - var dstPort uint16 - if to == nil { - route = &e.route - dstPort = e.dstPort - resolve = func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) { - // Promote lock to exclusive if using a shared route, given that it may - // need to change in Route.Resolve() call below. - e.mu.RUnlock() - e.mu.Lock() - - // Recheck state after lock was re-acquired. - if e.state != StateConnected { - err = tcpip.ErrInvalidEndpointState - } - if err == nil && route.IsResolutionRequired() { - ch, err = route.Resolve(waker) - } - - e.mu.Unlock() - e.mu.RLock() - - // Recheck state after lock was re-acquired. - if e.state != StateConnected { - err = tcpip.ErrInvalidEndpointState - } - return - } - } else { + route := e.route + dstPort := e.dstPort + if to != nil { // Reject destination address if it goes through a different // NIC than the endpoint was bound to. nicID := to.NIC @@ -505,9 +486,8 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } defer r.Release() - route = &r + route = r dstPort = dst.Port - resolve = route.Resolve } if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() { @@ -515,7 +495,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } if route.IsResolutionRequired() { - if ch, err := resolve(nil); err != nil { + if ch, err := route.Resolve(nil); err != nil { if err == tcpip.ErrWouldBlock { return 0, ch, tcpip.ErrNoLinkAddress } @@ -541,7 +521,24 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c useDefaultTTL = false } - if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner, e.noChecksum); err != nil { + localPort := e.ID.LocalPort + sendTOS := e.sendTOS + owner := e.owner + noChecksum := e.SocketOptions().GetNoChecksum() + lockReleased = true + e.mu.RUnlock() + + // Do not hold lock when sending as loopback is synchronous and if the UDP + // datagram ends up generating an ICMP response then it can result in a + // deadlock where the ICMP response handling ends up acquiring this endpoint's + // mutex using e.mu.RLock() in endpoint.HandleControlPacket which can cause a + // deadlock if another caller is trying to acquire e.mu in exclusive mode w/ + // e.mu.Lock(). Since e.mu.Lock() prevents any new read locks to ensure the + // lock can be eventually acquired. + // + // See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read + // locking is prohibited. + if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil { return 0, nil, err } return int64(len(v)), nil, nil @@ -552,66 +549,18 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { return 0, tcpip.ControlMessages{}, nil } -// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. -func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - switch opt { - case tcpip.MulticastLoopOption: - e.mu.Lock() - e.multicastLoop = v - e.mu.Unlock() - - case tcpip.NoChecksumOption: - e.mu.Lock() - e.noChecksum = v - e.mu.Unlock() - - case tcpip.ReceiveTOSOption: - e.mu.Lock() - e.receiveTOS = v - e.mu.Unlock() - - case tcpip.ReceiveTClassOption: - // We only support this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return tcpip.ErrNotSupported - } - - e.mu.Lock() - e.receiveTClass = v - e.mu.Unlock() - - case tcpip.ReceiveIPPacketInfoOption: - e.mu.Lock() - e.receiveIPPacketInfo = v - e.mu.Unlock() - - case tcpip.ReuseAddressOption: - e.mu.Lock() - e.portFlags.MostRecent = v - e.mu.Unlock() - - case tcpip.ReusePortOption: - e.mu.Lock() - e.portFlags.LoadBalanced = v - e.mu.Unlock() - - case tcpip.V6OnlyOption: - // We only recognize this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return tcpip.ErrInvalidEndpointState - } - - e.mu.Lock() - defer e.mu.Unlock() - - // We only allow this to be set when we're in the initial state. - if e.state != StateInitial { - return tcpip.ErrInvalidEndpointState - } +// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet. +func (e *endpoint) OnReuseAddressSet(v bool) { + e.mu.Lock() + e.portFlags.MostRecent = v + e.mu.Unlock() +} - e.v6only = v - } - return nil +// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet. +func (e *endpoint) OnReusePortSet(v bool) { + e.mu.Lock() + e.portFlags.LoadBalanced = v + e.mu.Unlock() } // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. @@ -823,81 +772,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { return nil } -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - case tcpip.KeepaliveEnabledOption: - return false, nil - - case tcpip.MulticastLoopOption: - e.mu.RLock() - v := e.multicastLoop - e.mu.RUnlock() - return v, nil - - case tcpip.NoChecksumOption: - e.mu.RLock() - v := e.noChecksum - e.mu.RUnlock() - return v, nil - - case tcpip.ReceiveTOSOption: - e.mu.RLock() - v := e.receiveTOS - e.mu.RUnlock() - return v, nil - - case tcpip.ReceiveTClassOption: - // We only support this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return false, tcpip.ErrNotSupported - } - - e.mu.RLock() - v := e.receiveTClass - e.mu.RUnlock() - return v, nil - - case tcpip.ReceiveIPPacketInfoOption: - e.mu.RLock() - v := e.receiveIPPacketInfo - e.mu.RUnlock() - return v, nil - - case tcpip.ReuseAddressOption: - e.mu.RLock() - v := e.portFlags.MostRecent - e.mu.RUnlock() - - return v, nil - - case tcpip.ReusePortOption: - e.mu.RLock() - v := e.portFlags.LoadBalanced - e.mu.RUnlock() - - return v, nil - - case tcpip.V6OnlyOption: - // We only recognize this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return false, tcpip.ErrUnknownProtocolOption - } - - e.mu.RLock() - v := e.v6only - e.mu.RUnlock() - - return v, nil - - case tcpip.AcceptConnOption: - return false, nil - - default: - return false, tcpip.ErrUnknownProtocolOption - } -} - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { @@ -1036,7 +910,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u // checkV4MappedLocked determines the effective network protocol and converts // addr to its canonical form. func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) { - unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only) + unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) if err != nil { return tcpip.FullAddress{}, 0, err } @@ -1048,7 +922,7 @@ func (e *endpoint) Disconnect() *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - if e.state != StateConnected { + if e.EndpointState() != StateConnected { return nil } var ( @@ -1071,7 +945,7 @@ func (e *endpoint) Disconnect() *tcpip.Error { if err != nil { return err } - e.state = StateBound + e.setEndpointState(StateBound) boundPortFlags = e.boundPortFlags } else { if e.ID.LocalPort != 0 { @@ -1079,14 +953,14 @@ func (e *endpoint) Disconnect() *tcpip.Error { e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{}) e.boundPortFlags = ports.Flags{} } - e.state = StateInitial + e.setEndpointState(StateInitial) } e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice) e.ID = id e.boundBindToDevice = btd e.route.Release() - e.route = stack.Route{} + e.route = nil e.dstPort = 0 return nil @@ -1104,7 +978,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { nicID := addr.NIC var localPort uint16 - switch e.state { + switch e.EndpointState() { case StateInitial: case StateBound, StateConnected: localPort = e.ID.LocalPort @@ -1139,7 +1013,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { RemoteAddress: r.RemoteAddress, } - if e.state == StateInitial { + if e.EndpointState() == StateInitial { id.LocalAddress = r.LocalAddress } @@ -1147,7 +1021,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // packets on a different network protocol, so we register both even if // v6only is set to false and this is an ipv6 endpoint. netProtos := []tcpip.NetworkProtocolNumber{netProto} - if netProto == header.IPv6ProtocolNumber && !e.v6only { + if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() { netProtos = []tcpip.NetworkProtocolNumber{ header.IPv4ProtocolNumber, header.IPv6ProtocolNumber, @@ -1173,7 +1047,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { e.RegisterNICID = nicID e.effectiveNetProtos = netProtos - e.state = StateConnected + e.setEndpointState(StateConnected) e.rcvMu.Lock() e.rcvReady = true @@ -1195,7 +1069,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { // A socket in the bound state can still receive multicast messages, // so we need to notify waiters on shutdown. - if e.state != StateBound && e.state != StateConnected { + if state := e.EndpointState(); state != StateBound && state != StateConnected { return tcpip.ErrNotConnected } @@ -1246,7 +1120,7 @@ func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.Networ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { // Don't allow binding once endpoint is not in the initial state // anymore. - if e.state != StateInitial { + if e.EndpointState() != StateInitial { return tcpip.ErrInvalidEndpointState } @@ -1259,7 +1133,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { // wildcard (empty) address, and this is an IPv6 endpoint with v6only // set to false. netProtos := []tcpip.NetworkProtocolNumber{netProto} - if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" { + if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() && addr.Addr == "" { netProtos = []tcpip.NetworkProtocolNumber{ header.IPv6ProtocolNumber, header.IPv4ProtocolNumber, @@ -1267,7 +1141,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { } nicID := addr.NIC - if len(addr.Addr) != 0 && !isBroadcastOrMulticast(addr.Addr) { + if len(addr.Addr) != 0 && !e.isBroadcastOrMulticast(addr.NIC, netProto, addr.Addr) { // A local unicast address was specified, verify that it's valid. nicID = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) if nicID == 0 { @@ -1290,7 +1164,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { e.effectiveNetProtos = netProtos // Mark endpoint as bound. - e.state = StateBound + e.setEndpointState(StateBound) e.rcvMu.Lock() e.rcvReady = true @@ -1322,7 +1196,7 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { defer e.mu.RUnlock() addr := e.ID.LocalAddress - if e.state == StateConnected { + if e.EndpointState() == StateConnected { addr = e.route.LocalAddress } @@ -1338,7 +1212,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() - if e.state != StateConnected { + if e.EndpointState() != StateConnected { return tcpip.FullAddress{}, tcpip.ErrNotConnected } @@ -1470,25 +1344,20 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) { if typ == stack.ControlPortUnreachable { - e.mu.RLock() - if e.state == StateConnected { + if e.EndpointState() == StateConnected { e.lastErrorMu.Lock() e.lastError = tcpip.ErrConnectionRefused e.lastErrorMu.Unlock() - e.mu.RUnlock() e.waiterQueue.Notify(waiter.EventErr) return } - e.mu.RUnlock() } } // State implements tcpip.Endpoint.State. func (e *endpoint) State() uint32 { - e.mu.Lock() - defer e.mu.Unlock() - return uint32(e.state) + return uint32(e.EndpointState()) } // Info returns a copy of the endpoint info. @@ -1508,14 +1377,16 @@ func (e *endpoint) Stats() tcpip.EndpointStats { // Wait implements tcpip.Endpoint.Wait. func (*endpoint) Wait() {} -func isBroadcastOrMulticast(a tcpip.Address) bool { - return a == header.IPv4Broadcast || header.IsV4MulticastAddress(a) || header.IsV6MulticastAddress(a) +func (e *endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { + return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) || e.stack.IsSubnetBroadcast(nicID, netProto, addr) } +// SetOwner implements tcpip.Endpoint.SetOwner. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 858c99a45..13b72dc88 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -98,7 +98,8 @@ func (e *endpoint) Resume(s *stack.Stack) { } } - if e.state != StateBound && e.state != StateConnected { + state := e.EndpointState() + if state != StateBound && state != StateConnected { return } @@ -113,12 +114,12 @@ func (e *endpoint) Resume(s *stack.Stack) { } var err *tcpip.Error - if e.state == StateConnected { - e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.multicastLoop) + if state == StateConnected { + e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.ops.GetMulticastLoop()) if err != nil { panic(err) } - } else if len(e.ID.LocalAddress) != 0 && !isBroadcastOrMulticast(e.ID.LocalAddress) { // stateBound + } else if len(e.ID.LocalAddress) != 0 && !e.isBroadcastOrMulticast(e.RegisterNICID, netProto, e.ID.LocalAddress) { // stateBound // A local unicast address is specified, verify that it's valid. if e.stack.CheckLocalAddress(e.RegisterNICID, netProto, e.ID.LocalAddress) == 0 { panic(tcpip.ErrBadLocalAddress) diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 764ad0857..e384f52dd 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -32,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) @@ -54,6 +55,7 @@ const ( stackPort = 1234 testAddr = "\x0a\x00\x00\x02" testPort = 4096 + invalidPort = 8192 multicastAddr = "\xe8\x2b\xd3\xea" multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" broadcastAddr = header.IPv4Broadcast @@ -295,7 +297,8 @@ func newDualTestContext(t *testing.T, mtu uint32) *testContext { t.Helper() return newDualTestContextWithOptions(t, mtu, stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, - TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, icmp.NewProtocol6, icmp.NewProtocol4}, + HandleLocal: true, }) } @@ -360,9 +363,7 @@ func (c *testContext) createEndpointForFlow(flow testFlow) { c.createEndpoint(flow.sockProto()) if flow.isV6Only() { - if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil { - c.t.Fatalf("SetSockOptBool failed: %s", err) - } + c.ep.SocketOptions().SetV6Only(true) } else if flow.isBroadcast() { c.ep.SocketOptions().SetBroadcast(true) } @@ -972,7 +973,7 @@ func testFailingWrite(c *testContext, flow testFlow, wantErr *tcpip.Error) { // provided. func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 { c.t.Helper() - return testWriteInternal(c, flow, true, checkers...) + return testWriteAndVerifyInternal(c, flow, true, checkers...) } // testWriteWithoutDestination sends a packet of the given test flow from the @@ -981,10 +982,10 @@ func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker // checker functions provided. func testWriteWithoutDestination(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 { c.t.Helper() - return testWriteInternal(c, flow, false, checkers...) + return testWriteAndVerifyInternal(c, flow, false, checkers...) } -func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 { +func testWriteNoVerify(c *testContext, flow testFlow, setDest bool) buffer.View { c.t.Helper() // Take a snapshot of the stats to validate them at the end of the test. epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone() @@ -1006,6 +1007,12 @@ func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ... c.t.Fatalf("Bad number of bytes written: got %v, want %v", n, len(payload)) } c.checkEndpointWriteStats(1, epstats, err) + return payload +} + +func testWriteAndVerifyInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 { + c.t.Helper() + payload := testWriteNoVerify(c, flow, setDest) // Received the packet and check the payload. b := c.getPacketAndVerify(flow, checkers...) var udp header.UDP @@ -1150,6 +1157,39 @@ func TestV4WriteOnConnected(t *testing.T) { testWriteWithoutDestination(c, unicastV4) } +func TestWriteOnConnectedInvalidPort(t *testing.T) { + protocols := map[string]tcpip.NetworkProtocolNumber{ + "ipv4": ipv4.ProtocolNumber, + "ipv6": ipv6.ProtocolNumber, + } + for name, pn := range protocols { + t.Run(name, func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpoint(pn) + if err := c.ep.Connect(tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}); err != nil { + c.t.Fatalf("Connect failed: %s", err) + } + writeOpts := tcpip.WriteOptions{ + To: &tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}, + } + payload := buffer.View(newPayload()) + n, _, err := c.ep.Write(tcpip.SlicePayload(payload), writeOpts) + if err != nil { + c.t.Fatalf("c.ep.Write(...) = %+s, want nil", err) + } + if got, want := n, int64(len(payload)); got != want { + c.t.Fatalf("c.ep.Write(...) wrote %d bytes, want %d bytes", got, want) + } + + if err := c.ep.LastError(); err != tcpip.ErrConnectionRefused { + c.t.Fatalf("expected c.ep.LastError() == ErrConnectionRefused, got: %+v", err) + } + }) + } +} + // TestWriteOnBoundToV4Multicast checks that we can send packets out of a socket // that is bound to a V4 multicast address. func TestWriteOnBoundToV4Multicast(t *testing.T) { @@ -1372,9 +1412,7 @@ func TestReadIPPacketInfo(t *testing.T) { } } - if err := c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true); err != nil { - t.Fatalf("c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true): %s", err) - } + c.ep.SocketOptions().SetReceivePacketInfo(true) testRead(c, test.flow, checker.ReceiveIPPacketInfo(tcpip.IPPacketInfo{ NIC: 1, @@ -1412,16 +1450,12 @@ func TestNoChecksum(t *testing.T) { c.createEndpointForFlow(flow) // Disable the checksum generation. - if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, true); err != nil { - t.Fatalf("SetSockOptBool failed: %s", err) - } + c.ep.SocketOptions().SetNoChecksum(true) // This option is effective on IPv4 only. testWrite(c, flow, checker.UDP(checker.NoChecksum(flow.isV4()))) // Enable the checksum generation. - if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, false); err != nil { - t.Fatalf("SetSockOptBool failed: %s", err) - } + c.ep.SocketOptions().SetNoChecksum(false) testWrite(c, flow, checker.UDP(checker.NoChecksum(false))) }) } @@ -1591,13 +1625,15 @@ func TestSetTClass(t *testing.T) { } func TestReceiveTosTClass(t *testing.T) { + const RcvTOSOpt = "ReceiveTosOption" + const RcvTClassOpt = "ReceiveTClassOption" + testCases := []struct { - name string - getReceiveOption tcpip.SockOptBool - tests []testFlow + name string + tests []testFlow }{ - {"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}}, - {"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}}, + {RcvTOSOpt, []testFlow{unicastV4, broadcast}}, + {RcvTClassOpt, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}}, } for _, testCase := range testCases { for _, flow := range testCase.tests { @@ -1606,29 +1642,32 @@ func TestReceiveTosTClass(t *testing.T) { defer c.cleanup() c.createEndpointForFlow(flow) - option := testCase.getReceiveOption name := testCase.name - // Verify that setting and reading the option works. - v, err := c.ep.GetSockOptBool(option) - if err != nil { - c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err) + var optionGetter func() bool + var optionSetter func(bool) + switch name { + case RcvTOSOpt: + optionGetter = c.ep.SocketOptions().GetReceiveTOS + optionSetter = c.ep.SocketOptions().SetReceiveTOS + case RcvTClassOpt: + optionGetter = c.ep.SocketOptions().GetReceiveTClass + optionSetter = c.ep.SocketOptions().SetReceiveTClass + default: + t.Fatalf("unkown test variant: %s", name) } + + // Verify that setting and reading the option works. + v := optionGetter() // Test for expected default value. if v != false { c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false) } want := true - if err := c.ep.SetSockOptBool(option, want); err != nil { - c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err) - } - - got, err := c.ep.GetSockOptBool(option) - if err != nil { - c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err) - } + optionSetter(want) + got := optionGetter() if got != want { c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want) } @@ -1638,10 +1677,10 @@ func TestReceiveTosTClass(t *testing.T) { if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %s", err) } - switch option { - case tcpip.ReceiveTClassOption: + switch name { + case RcvTClassOpt: testRead(c, flow, checker.ReceiveTClass(testTOS)) - case tcpip.ReceiveTOSOption: + case RcvTOSOpt: testRead(c, flow, checker.ReceiveTOS(testTOS)) default: t.Fatalf("unknown test variant: %s", name) |