// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "sync" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) const ( // immediateDuration is a duration of zero for scheduling work that needs to // be done immediately but asynchronously to avoid deadlock. immediateDuration time.Duration = 0 ) // NeighborEntry describes a neighboring device in the local network. type NeighborEntry struct { Addr tcpip.Address LinkAddr tcpip.LinkAddress State NeighborState UpdatedAtNanos int64 } // NeighborState defines the state of a NeighborEntry within the Neighbor // Unreachability Detection state machine, as per RFC 4861 section 7.3.2 and // RFC 7048. type NeighborState uint8 const ( // Unknown means reachability has not been verified yet. This is the initial // state of entries that have been created automatically by the Neighbor // Unreachability Detection state machine. Unknown NeighborState = iota // Incomplete means that there is an outstanding request to resolve the // address. Incomplete // Reachable means the path to the neighbor is functioning properly for both // receive and transmit paths. Reachable // Stale means reachability to the neighbor is unknown, but packets are still // able to be transmitted to the possibly stale link address. Stale // Delay means reachability to the neighbor is unknown and pending // confirmation from an upper-level protocol like TCP, but packets are still // able to be transmitted to the possibly stale link address. Delay // Probe means a reachability confirmation is actively being sought by // periodically retransmitting reachability probes until a reachability // confirmation is received, or until the maximum number of probes has been // sent. Probe // Static describes entries that have been explicitly added by the user. They // do not expire and are not deleted until explicitly removed. Static // Unreachable means reachability confirmation failed; the maximum number of // reachability probes has been sent and no replies have been received. // // TODO(gvisor.dev/issue/5472): Add the following sentence when we implement // RFC 7048: "Packets continue to be sent to the neighbor while // re-attempting to resolve the address." Unreachable ) type timer struct { // done indicates to the timer that the timer was stopped. done *bool timer tcpip.Timer } // neighborEntry implements a neighbor entry's individual node behavior, as per // RFC 4861 section 7.3.3. Neighbor Unreachability Detection operates in // parallel with the sending of packets to a neighbor, necessitating the // entry's lock to be acquired for all operations. type neighborEntry struct { neighborEntryEntry cache *neighborCache // nudState points to the Neighbor Unreachability Detection configuration. nudState *NUDState mu struct { sync.RWMutex neigh NeighborEntry // done is closed when address resolution is complete. It is nil iff s is // incomplete and resolution is not yet in progress. done chan struct{} // onResolve is called with the result of address resolution. onResolve []func(LinkResolutionResult) isRouter bool timer timer } } // newNeighborEntry creates a neighbor cache entry starting at the default // state, Unknown. Transition out of Unknown by calling either // `handlePacketQueuedLocked` or `handleProbeLocked` on the newly created // neighborEntry. func newNeighborEntry(cache *neighborCache, remoteAddr tcpip.Address, nudState *NUDState) *neighborEntry { n := &neighborEntry{ cache: cache, nudState: nudState, } n.mu.Lock() n.mu.neigh = NeighborEntry{ Addr: remoteAddr, State: Unknown, } n.mu.Unlock() return n } // newStaticNeighborEntry creates a neighbor cache entry starting at the // Static state. The entry can only transition out of Static by directly // calling `setStateLocked`. func newStaticNeighborEntry(cache *neighborCache, addr tcpip.Address, linkAddr tcpip.LinkAddress, state *NUDState) *neighborEntry { entry := NeighborEntry{ Addr: addr, LinkAddr: linkAddr, State: Static, UpdatedAtNanos: cache.nic.stack.clock.NowNanoseconds(), } n := &neighborEntry{ cache: cache, nudState: state, } n.mu.Lock() n.mu.neigh = entry n.mu.Unlock() return n } // notifyCompletionLocked notifies those waiting for address resolution, with // the link address if resolution completed successfully. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) notifyCompletionLocked(succeeded bool) { res := LinkResolutionResult{LinkAddress: e.mu.neigh.LinkAddr, Success: succeeded} for _, callback := range e.mu.onResolve { callback(res) } e.mu.onResolve = nil if ch := e.mu.done; ch != nil { close(ch) e.mu.done = nil // Dequeue the pending packets in a new goroutine to not hold up the current // goroutine as writing packets may be a costly operation. // // At the time of writing, when writing packets, a neighbor's link address // is resolved (which ends up obtaining the entry's lock) while holding the // link resolution queue's lock. Dequeuing packets in a new goroutine avoids // a lock ordering violation. go e.cache.nic.linkResQueue.dequeue(ch, e.mu.neigh.LinkAddr, succeeded) } } // dispatchAddEventLocked signals to stack's NUD Dispatcher that the entry has // been added. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) dispatchAddEventLocked() { if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil { nudDisp.OnNeighborAdded(e.cache.nic.id, e.mu.neigh) } } // dispatchChangeEventLocked signals to stack's NUD Dispatcher that the entry // has changed state or link-layer address. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) dispatchChangeEventLocked() { if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil { nudDisp.OnNeighborChanged(e.cache.nic.id, e.mu.neigh) } } // dispatchRemoveEventLocked signals to stack's NUD Dispatcher that the entry // has been removed. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) dispatchRemoveEventLocked() { if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil { nudDisp.OnNeighborRemoved(e.cache.nic.id, e.mu.neigh) } } // cancelTimerLocked cancels the currently scheduled action, if there is one. // Entries in Unknown, Stale, or Static state do not have a scheduled action. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) cancelTimerLocked() { if e.mu.timer.timer != nil { e.mu.timer.timer.Stop() *e.mu.timer.done = true e.mu.timer = timer{} } } // removeLocked prepares the entry for removal. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) removeLocked() { e.mu.neigh.UpdatedAtNanos = e.cache.nic.stack.clock.NowNanoseconds() e.dispatchRemoveEventLocked() e.cancelTimerLocked() e.notifyCompletionLocked(false /* succeeded */) } // setStateLocked transitions the entry to the specified state immediately. // // Follows the logic defined in RFC 4861 section 7.3.3. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) setStateLocked(next NeighborState) { e.cancelTimerLocked() prev := e.mu.neigh.State e.mu.neigh.State = next e.mu.neigh.UpdatedAtNanos = e.cache.nic.stack.clock.NowNanoseconds() config := e.nudState.Config() switch next { case Incomplete: panic(fmt.Sprintf("should never transition to Incomplete with setStateLocked; neigh = %#v, prev state = %s", e.mu.neigh, prev)) case Reachable: // Protected by e.mu. done := false e.mu.timer = timer{ done: &done, timer: e.cache.nic.stack.Clock().AfterFunc(e.nudState.ReachableTime(), func() { e.mu.Lock() defer e.mu.Unlock() if done { // The timer was stopped because the entry changed state. return } e.setStateLocked(Stale) e.dispatchChangeEventLocked() }), } case Delay: // Protected by e.mu. done := false e.mu.timer = timer{ done: &done, timer: e.cache.nic.stack.Clock().AfterFunc(config.DelayFirstProbeTime, func() { e.mu.Lock() defer e.mu.Unlock() if done { // The timer was stopped because the entry changed state. return } e.setStateLocked(Probe) e.dispatchChangeEventLocked() }), } case Probe: // Protected by e.mu. done := false remaining := config.MaxUnicastProbes addr := e.mu.neigh.Addr linkAddr := e.mu.neigh.LinkAddr // Send a probe in another gorountine to free this thread of execution // for finishing the state transition. This is necessary to escape the // currently held lock so we can send the probe message without holding // a shared lock. e.mu.timer = timer{ done: &done, timer: e.cache.nic.stack.Clock().AfterFunc(0, func() { var err tcpip.Error timedoutResolution := remaining == 0 if !timedoutResolution { err = e.cache.linkRes.LinkAddressRequest(addr, "" /* localAddr */, linkAddr) } e.mu.Lock() defer e.mu.Unlock() if done { // The timer was stopped because the entry changed state. return } if timedoutResolution || err != nil { e.setStateLocked(Unreachable) e.dispatchChangeEventLocked() return } remaining-- e.mu.timer.timer.Reset(config.RetransmitTimer) }), } case Unreachable: e.notifyCompletionLocked(false /* succeeded */) case Unknown, Stale, Static: // Do nothing default: panic(fmt.Sprintf("Invalid state transition from %q to %q", prev, next)) } } // handlePacketQueuedLocked advances the state machine according to a packet // being queued for outgoing transmission. // // Follows the logic defined in RFC 4861 section 7.3.3. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) handlePacketQueuedLocked(localAddr tcpip.Address) { switch e.mu.neigh.State { case Unknown, Unreachable: prev := e.mu.neigh.State e.mu.neigh.State = Incomplete e.mu.neigh.UpdatedAtNanos = e.cache.nic.stack.clock.NowNanoseconds() switch prev { case Unknown: e.dispatchAddEventLocked() case Unreachable: e.dispatchChangeEventLocked() e.cache.nic.stats.Neighbor.UnreachableEntryLookups.Increment() } config := e.nudState.Config() // Protected by e.mu. done := false remaining := config.MaxMulticastProbes addr := e.mu.neigh.Addr // Send a probe in another gorountine to free this thread of execution // for finishing the state transition. This is necessary to escape the // currently held lock so we can send the probe message without holding // a shared lock. e.mu.timer = timer{ done: &done, timer: e.cache.nic.stack.Clock().AfterFunc(0, func() { var err tcpip.Error timedoutResolution := remaining == 0 if !timedoutResolution { // As per RFC 4861 section 7.2.2: // // If the source address of the packet prompting the solicitation is // the same as one of the addresses assigned to the outgoing interface, // that address SHOULD be placed in the IP Source Address of the // outgoing solicitation. // err = e.cache.linkRes.LinkAddressRequest(addr, localAddr, "" /* linkAddr */) } e.mu.Lock() defer e.mu.Unlock() if done { // The timer was stopped because the entry changed state. return } if timedoutResolution || err != nil { e.setStateLocked(Unreachable) e.dispatchChangeEventLocked() return } remaining-- e.mu.timer.timer.Reset(config.RetransmitTimer) }), } case Stale: e.setStateLocked(Delay) e.dispatchChangeEventLocked() case Incomplete, Reachable, Delay, Probe, Static: // Do nothing default: panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State)) } } // handleProbeLocked processes an incoming neighbor probe (e.g. ARP request or // Neighbor Solicitation for ARP or NDP, respectively). // // Follows the logic defined in RFC 4861 section 7.2.3. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) handleProbeLocked(remoteLinkAddr tcpip.LinkAddress) { // Probes MUST be silently discarded if the target address is tentative, does // not exist, or not bound to the NIC as per RFC 4861 section 7.2.3. These // checks MUST be done by the NetworkEndpoint. switch e.mu.neigh.State { case Unknown: e.mu.neigh.LinkAddr = remoteLinkAddr e.setStateLocked(Stale) e.dispatchAddEventLocked() case Incomplete: // "If an entry already exists, and the cached link-layer address // differs from the one in the received Source Link-Layer option, the // cached address should be replaced by the received address, and the // entry's reachability state MUST be set to STALE." // - RFC 4861 section 7.2.3 e.mu.neigh.LinkAddr = remoteLinkAddr e.setStateLocked(Stale) e.notifyCompletionLocked(true /* succeeded */) e.dispatchChangeEventLocked() case Reachable, Delay, Probe: if e.mu.neigh.LinkAddr != remoteLinkAddr { e.mu.neigh.LinkAddr = remoteLinkAddr e.setStateLocked(Stale) e.dispatchChangeEventLocked() } case Stale: if e.mu.neigh.LinkAddr != remoteLinkAddr { e.mu.neigh.LinkAddr = remoteLinkAddr e.dispatchChangeEventLocked() } case Unreachable: // TODO(gvisor.dev/issue/5472): Do not change the entry if the link // address is the same, as per RFC 7048. e.mu.neigh.LinkAddr = remoteLinkAddr e.setStateLocked(Stale) e.dispatchChangeEventLocked() case Static: // Do nothing default: panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State)) } } // handleConfirmationLocked processes an incoming neighbor confirmation // (e.g. ARP reply or Neighbor Advertisement for ARP or NDP, respectively). // // Follows the state machine defined by RFC 4861 section 7.2.5. // // TODO(gvisor.dev/issue/2277): To protect against ARP poisoning and other // attacks against NDP functions, Secure Neighbor Discovery (SEND) Protocol // should be deployed where preventing access to the broadcast segment might // not be possible. SEND uses RSA key pairs to produce Cryptographically // Generated Addresses (CGA), as defined in RFC 3972. This ensures that the // claimed source of an NDP message is the owner of the claimed address. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) { switch e.mu.neigh.State { case Incomplete: if len(linkAddr) == 0 { // "If the link layer has addresses and no Target Link-Layer Address // option is included, the receiving node SHOULD silently discard the // received advertisement." - RFC 4861 section 7.2.5 break } e.mu.neigh.LinkAddr = linkAddr if flags.Solicited { e.setStateLocked(Reachable) } else { e.setStateLocked(Stale) } e.dispatchChangeEventLocked() e.mu.isRouter = flags.IsRouter e.notifyCompletionLocked(true /* succeeded */) // "Note that the Override flag is ignored if the entry is in the // INCOMPLETE state." - RFC 4861 section 7.2.5 case Reachable, Stale, Delay, Probe: isLinkAddrDifferent := len(linkAddr) != 0 && e.mu.neigh.LinkAddr != linkAddr if isLinkAddrDifferent { if !flags.Override { if e.mu.neigh.State == Reachable { e.setStateLocked(Stale) e.dispatchChangeEventLocked() } break } e.mu.neigh.LinkAddr = linkAddr if !flags.Solicited { if e.mu.neigh.State != Stale { e.setStateLocked(Stale) e.dispatchChangeEventLocked() } else { // Notify the LinkAddr change, even though NUD state hasn't changed. e.dispatchChangeEventLocked() } break } } if flags.Solicited && (flags.Override || !isLinkAddrDifferent) { wasReachable := e.mu.neigh.State == Reachable // Set state to Reachable again to refresh timers. e.setStateLocked(Reachable) e.notifyCompletionLocked(true /* succeeded */) if !wasReachable { e.dispatchChangeEventLocked() } } if e.mu.isRouter && !flags.IsRouter && header.IsV6UnicastAddress(e.mu.neigh.Addr) { // "In those cases where the IsRouter flag changes from TRUE to FALSE as // a result of this update, the node MUST remove that router from the // Default Router List and update the Destination Cache entries for all // destinations using that neighbor as a router as specified in Section // 7.3.3. This is needed to detect when a node that is used as a router // stops forwarding packets due to being configured as a host." // - RFC 4861 section 7.2.5 // // TODO(gvisor.dev/issue/4085): Remove the special casing we do for IPv6 // here. ep, ok := e.cache.nic.networkEndpoints[header.IPv6ProtocolNumber] if !ok { panic(fmt.Sprintf("have a neighbor entry for an IPv6 router but no IPv6 network endpoint")) } if ndpEP, ok := ep.(NDPEndpoint); ok { ndpEP.InvalidateDefaultRouter(e.mu.neigh.Addr) } } e.mu.isRouter = flags.IsRouter case Unknown, Unreachable, Static: // Do nothing default: panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State)) } } // handleUpperLevelConfirmationLocked processes an incoming upper-level protocol // (e.g. TCP acknowledgements) reachability confirmation. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) handleUpperLevelConfirmationLocked() { switch e.mu.neigh.State { case Reachable, Stale, Delay, Probe: wasReachable := e.mu.neigh.State == Reachable // Set state to Reachable again to refresh timers. e.setStateLocked(Reachable) if !wasReachable { e.dispatchChangeEventLocked() } case Unknown, Incomplete, Unreachable, Static: // Do nothing default: panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State)) } }