diff options
-rwxr-xr-x | pkg/sentry/platform/ring0/defs_impl.go | 6 | ||||
-rw-r--r-- | pkg/tcpip/stack/nic.go | 63 | ||||
-rw-r--r-- | pkg/tcpip/stack/stack.go | 32 | ||||
-rw-r--r-- | pkg/tcpip/tcpip.go | 9 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/accept.go | 30 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/connect.go | 19 | ||||
-rw-r--r-- | runsc/boot/loader.go | 49 |
7 files changed, 122 insertions, 86 deletions
diff --git a/pkg/sentry/platform/ring0/defs_impl.go b/pkg/sentry/platform/ring0/defs_impl.go index d4bfc5a4a..8efc3825f 100755 --- a/pkg/sentry/platform/ring0/defs_impl.go +++ b/pkg/sentry/platform/ring0/defs_impl.go @@ -1,14 +1,14 @@ package ring0 import ( - "syscall" - "fmt" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" "io" "reflect" + "syscall" + + "gvisor.dev/gvisor/pkg/sentry/usermem" ) var ( diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index ae56e0ffd..43719085e 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -36,13 +36,13 @@ type NIC struct { demux *transportDemuxer - mu sync.RWMutex - spoofing bool - promiscuous bool - primary map[tcpip.NetworkProtocolNumber]*ilist.List - endpoints map[NetworkEndpointID]*referencedNetworkEndpoint - subnets []tcpip.Subnet - mcastJoins map[NetworkEndpointID]int32 + mu sync.RWMutex + spoofing bool + promiscuous bool + primary map[tcpip.NetworkProtocolNumber]*ilist.List + endpoints map[NetworkEndpointID]*referencedNetworkEndpoint + addressRanges []tcpip.Subnet + mcastJoins map[NetworkEndpointID]int32 stats NICStats } @@ -224,7 +224,17 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t // the caller or if the address is found in the NIC's subnets. createTempEP := spoofingOrPromiscuous if !createTempEP { - for _, sn := range n.subnets { + for _, sn := range n.addressRanges { + // Skip the subnet address. + if address == sn.ID() { + continue + } + // For now just skip the broadcast address, until we support it. + // FIXME(b/137608825): Add support for sending/receiving directed + // (subnet) broadcast. + if address == sn.Broadcast() { + continue + } if sn.Contains(address) { createTempEP = true break @@ -381,45 +391,38 @@ func (n *NIC) Addresses() []tcpip.ProtocolAddress { return addrs } -// AddSubnet adds a new subnet to n, so that it starts accepting packets -// targeted at the given address and network protocol. -func (n *NIC) AddSubnet(protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) { +// AddAddressRange adds a range of addresses to n, so that it starts accepting +// packets targeted at the given addresses and network protocol. The range is +// given by a subnet address, and all addresses contained in the subnet are +// used except for the subnet address itself and the subnet's broadcast +// address. +func (n *NIC) AddAddressRange(protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) { n.mu.Lock() - n.subnets = append(n.subnets, subnet) + n.addressRanges = append(n.addressRanges, subnet) n.mu.Unlock() } -// RemoveSubnet removes the given subnet from n. -func (n *NIC) RemoveSubnet(subnet tcpip.Subnet) { +// RemoveAddressRange removes the given address range from n. +func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) { n.mu.Lock() // Use the same underlying array. - tmp := n.subnets[:0] - for _, sub := range n.subnets { + tmp := n.addressRanges[:0] + for _, sub := range n.addressRanges { if sub != subnet { tmp = append(tmp, sub) } } - n.subnets = tmp + n.addressRanges = tmp n.mu.Unlock() } -// ContainsSubnet reports whether this NIC contains the given subnet. -func (n *NIC) ContainsSubnet(subnet tcpip.Subnet) bool { - for _, s := range n.Subnets() { - if s == subnet { - return true - } - } - return false -} - // Subnets returns the Subnets associated with this NIC. -func (n *NIC) Subnets() []tcpip.Subnet { +func (n *NIC) AddressRanges() []tcpip.Subnet { n.mu.RLock() defer n.mu.RUnlock() - sns := make([]tcpip.Subnet, 0, len(n.subnets)+len(n.endpoints)) + sns := make([]tcpip.Subnet, 0, len(n.addressRanges)+len(n.endpoints)) for nid := range n.endpoints { sn, err := tcpip.NewSubnet(nid.LocalAddress, tcpip.AddressMask(strings.Repeat("\xff", len(nid.LocalAddress)))) if err != nil { @@ -429,7 +432,7 @@ func (n *NIC) Subnets() []tcpip.Subnet { } sns = append(sns, sn) } - return append(sns, n.subnets...) + return append(sns, n.addressRanges...) } func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) { diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 1d5e84a8b..6beca6ae8 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -702,14 +702,14 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool { } // NICSubnets returns a map of NICIDs to their associated subnets. -func (s *Stack) NICSubnets() map[tcpip.NICID][]tcpip.Subnet { +func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet { s.mu.RLock() defer s.mu.RUnlock() nics := map[tcpip.NICID][]tcpip.Subnet{} for id, nic := range s.nics { - nics[id] = append(nics[id], nic.Subnets()...) + nics[id] = append(nics[id], nic.AddressRanges()...) } return nics } @@ -810,45 +810,35 @@ func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tc return nic.AddAddress(protocolAddress, peb) } -// AddSubnet adds a subnet range to the specified NIC. -func (s *Stack) AddSubnet(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error { +// AddAddressRange adds a range of addresses to the specified NIC. The range is +// given by a subnet address, and all addresses contained in the subnet are +// used except for the subnet address itself and the subnet's broadcast +// address. +func (s *Stack) AddAddressRange(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() if nic, ok := s.nics[id]; ok { - nic.AddSubnet(protocol, subnet) + nic.AddAddressRange(protocol, subnet) return nil } return tcpip.ErrUnknownNICID } -// RemoveSubnet removes the subnet range from the specified NIC. -func (s *Stack) RemoveSubnet(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error { +// RemoveAddressRange removes the range of addresses from the specified NIC. +func (s *Stack) RemoveAddressRange(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() if nic, ok := s.nics[id]; ok { - nic.RemoveSubnet(subnet) + nic.RemoveAddressRange(subnet) return nil } return tcpip.ErrUnknownNICID } -// ContainsSubnet reports whether the specified NIC contains the specified -// subnet. -func (s *Stack) ContainsSubnet(id tcpip.NICID, subnet tcpip.Subnet) (bool, *tcpip.Error) { - s.mu.RLock() - defer s.mu.RUnlock() - - if nic, ok := s.nics[id]; ok { - return nic.ContainsSubnet(subnet), nil - } - - return false, tcpip.ErrUnknownNICID -} - // RemoveAddress removes an existing network-layer address from the specified // NIC. func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) *tcpip.Error { diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 05aa42c98..418e771d2 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -219,6 +219,15 @@ func (s *Subnet) Mask() AddressMask { return s.mask } +// Broadcast returns the subnet's broadcast address. +func (s *Subnet) Broadcast() Address { + addr := []byte(s.address) + for i := range addr { + addr[i] |= ^s.mask[i] + } + return Address(addr) +} + // NICID is a number that uniquely identifies a NIC. type NICID int32 diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index e9c5099ea..0802e984e 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -143,6 +143,15 @@ func decSynRcvdCount() { synRcvdCount.Unlock() } +// synCookiesInUse() returns true if the synRcvdCount is greater than +// SynRcvdCountThreshold. +func synCookiesInUse() bool { + synRcvdCount.Lock() + v := synRcvdCount.value + synRcvdCount.Unlock() + return v >= SynRcvdCountThreshold +} + // newListenContext creates a new listen context. func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { l := &listenContext{ @@ -446,6 +455,27 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { return } + if !synCookiesInUse() { + // Send a reset as this is an ACK for which there is no + // half open connections and we are not using cookies + // yet. + // + // The only time we should reach here when a connection + // was opened and closed really quickly and a delayed + // ACK was received from the sender. + replyWithReset(s) + return + } + + // Since SYN cookies are in use this is potentially an ACK to a + // SYN-ACK we sent but don't have a half open connection state + // as cookies are being used to protect against a potential SYN + // flood. In such cases validate the cookie and if valid create + // a fully connected endpoint and deliver to the accept queue. + // + // If not, silently drop the ACK to avoid leaking information + // when under a potential syn flood attack. + // // Validate the cookie. data, ok := ctx.isCookieValid(s.id, s.ackNumber-1, s.sequenceNumber-1) if !ok || int(data) >= len(mssTable) { diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 00d2ae524..21038a65a 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -720,13 +720,18 @@ func (e *endpoint) handleClose() *tcpip.Error { return nil } -// resetConnectionLocked sends a RST segment and puts the endpoint in an error -// state with the given error code. This method must only be called from the -// protocol goroutine. +// resetConnectionLocked puts the endpoint in an error state with the given +// error code and sends a RST if and only if the error is not ErrConnectionReset +// indicating that the connection is being reset due to receiving a RST. This +// method must only be called from the protocol goroutine. func (e *endpoint) resetConnectionLocked(err *tcpip.Error) { - e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, e.snd.sndUna, e.rcv.rcvNxt, 0) + // Only send a reset if the connection is being aborted for a reason + // other than receiving a reset. e.state = StateError e.hardError = err + if err != tcpip.ErrConnectionReset { + e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, e.snd.sndUna, e.rcv.rcvNxt, 0) + } } // completeWorkerLocked is called by the worker goroutine when it's about to @@ -806,7 +811,7 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error { if e.keepalive.unacked >= e.keepalive.count { e.keepalive.Unlock() - return tcpip.ErrConnectionReset + return tcpip.ErrTimeout } // RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with @@ -1068,6 +1073,10 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { e.workMu.Lock() if err := funcs[v].f(); err != nil { e.mu.Lock() + // Ensure we release all endpoint registration and route + // references as the connection is now in an error + // state. + e.workerCleanup = true e.resetConnectionLocked(err) // Lock released below. epilogue() diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 19b738705..823a34619 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -760,26 +760,34 @@ func (l *Loader) destroyContainer(cid string) error { if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { return fmt.Errorf("sending SIGKILL to all container processes: %v", err) } + // Wait for all processes that belong to the container to exit (including + // exec'd processes). + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } + } + + // At this point, all processes inside of the container have exited, + // releasing all references to the container's MountNamespace and + // causing all submounts and overlays to be unmounted. + // + // Since the container's MountNamespace has been released, + // MountNamespace.destroy() will have executed, but that function may + // trigger async close operations. We must wait for those to complete + // before returning, otherwise the caller may kill the gofer before + // they complete, causing a cascade of failing RPCs. + fs.AsyncBarrier() } - // Remove all container thread groups from the map. + // No more failure from this point on. Remove all container thread groups + // from the map. for key := range l.processes { if key.cid == cid { delete(l.processes, key) } } - // At this point, all processes inside of the container have exited, - // releasing all references to the container's MountNamespace and - // causing all submounts and overlays to be unmounted. - // - // Since the container's MountNamespace has been released, - // MountNamespace.destroy() will have executed, but that function may - // trigger async close operations. We must wait for those to complete - // before returning, otherwise the caller may kill the gofer before - // they complete, causing a cascade of failing RPCs. - fs.AsyncBarrier() - log.Debugf("Container destroyed %q", cid) return nil } @@ -1037,21 +1045,8 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error { // the signal is delivered. This prevents process leaks when SIGKILL is // sent to the entire container. l.k.Pause() - if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil { - l.k.Unpause() - return err - } - l.k.Unpause() - - // If SIGKILLing all processes, wait for them to exit. - if linux.Signal(signo) == linux.SIGKILL { - for _, t := range l.k.TaskSet().Root.Tasks() { - if t.ContainerID() == cid { - t.ThreadGroup().WaitExited() - } - } - } - return nil + defer l.k.Unpause() + return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}) } // threadGroupFromID same as threadGroupFromIDLocked except that it acquires |