// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ipv6 contains the implementation of the ipv6 network protocol. package ipv6 import ( "encoding/binary" "fmt" "hash/fnv" "math" "reflect" "sort" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/network/hash" "gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // ReassembleTimeout controls how long a fragment will be held. // As per RFC 8200 section 4.5: // // If insufficient fragments are received to complete reassembly of a packet // within 60 seconds of the reception of the first-arriving fragment of that // packet, reassembly of that packet must be abandoned. // // Linux also uses 60 seconds for reassembly timeout: // https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ipv6.h#L456 ReassembleTimeout = 60 * time.Second // ProtocolNumber is the ipv6 protocol number. ProtocolNumber = header.IPv6ProtocolNumber // maxPayloadSize is the maximum size that can be encoded in the 16-bit // PayloadLength field of the ipv6 header. maxPayloadSize = 0xffff // DefaultTTL is the default hop limit for IPv6 Packets egressed by // Netstack. DefaultTTL = 64 // buckets for fragment identifiers buckets = 2048 ) // policyTable is the default policy table defined in RFC 6724 section 2.1. // // A more human-readable version: // // Prefix Precedence Label // ::1/128 50 0 // ::/0 40 1 // ::ffff:0:0/96 35 4 // 2002::/16 30 2 // 2001::/32 5 5 // fc00::/7 3 13 // ::/96 1 3 // fec0::/10 1 11 // 3ffe::/16 1 12 // // The table is sorted by prefix length so longest-prefix match can be easily // achieved. // // We willingly left out ::/96, fec0::/10 and 3ffe::/16 since those prefix // assignments are deprecated. // // As per RFC 4291 section 2.5.5.1 (for ::/96), // // The "IPv4-Compatible IPv6 address" is now deprecated because the // current IPv6 transition mechanisms no longer use these addresses. // New or updated implementations are not required to support this // address type. // // As per RFC 3879 section 4 (for fec0::/10), // // This document formally deprecates the IPv6 site-local unicast prefix // defined in [RFC3513], i.e., 1111111011 binary or FEC0::/10. // // As per RFC 3701 section 1 (for 3ffe::/16), // // As clearly stated in [TEST-NEW], the addresses for the 6bone are // temporary and will be reclaimed in the future. It further states // that all users of these addresses (within the 3FFE::/16 prefix) will // be required to renumber at some time in the future. // // and section 2, // // Thus after the pTLA allocation cutoff date January 1, 2004, it is // REQUIRED that no new 6bone 3FFE pTLAs be allocated. // // MUST NOT BE MODIFIED. var policyTable = [...]struct { subnet tcpip.Subnet label uint8 }{ // ::1/128 { subnet: header.IPv6Loopback.WithPrefix().Subnet(), label: 0, }, // ::ffff:0:0/96 { subnet: header.IPv4MappedIPv6Subnet, label: 4, }, // 2001::/32 (Teredo prefix as per RFC 4380 section 2.6). { subnet: tcpip.AddressWithPrefix{ Address: "\x20\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", PrefixLen: 32, }.Subnet(), label: 5, }, // 2002::/16 (6to4 prefix as per RFC 3056 section 2). { subnet: tcpip.AddressWithPrefix{ Address: "\x20\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", PrefixLen: 16, }.Subnet(), label: 2, }, // fc00::/7 (Unique local addresses as per RFC 4193 section 3.1). { subnet: tcpip.AddressWithPrefix{ Address: "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", PrefixLen: 7, }.Subnet(), label: 13, }, // ::/0 { subnet: header.IPv6EmptySubnet, label: 1, }, } func getLabel(addr tcpip.Address) uint8 { for _, p := range policyTable { if p.subnet.Contains(addr) { return p.label } } panic(fmt.Sprintf("should have a label for address = %s", addr)) } var _ stack.DuplicateAddressDetector = (*endpoint)(nil) var _ stack.LinkAddressResolver = (*endpoint)(nil) var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil) var _ stack.GroupAddressableEndpoint = (*endpoint)(nil) var _ stack.AddressableEndpoint = (*endpoint)(nil) var _ stack.NetworkEndpoint = (*endpoint)(nil) var _ stack.NDPEndpoint = (*endpoint)(nil) var _ NDPEndpoint = (*endpoint)(nil) type endpoint struct { nic stack.NetworkInterface dispatcher stack.TransportDispatcher protocol *protocol stack *stack.Stack stats sharedStats // enabled is set to 1 when the endpoint is enabled and 0 when it is // disabled. // // Must be accessed using atomic operations. enabled uint32 mu struct { sync.RWMutex addressableEndpointState stack.AddressableEndpointState ndp ndpState mld mldState } // dad is used to check if an arbitrary address is already assigned to some // neighbor. // // Note: this is different from mu.ndp.dad which is used to perform DAD for // addresses that are assigned to the interface. Removing an address aborts // DAD; if we had used the same state, handlers for a removed address would // not be called with the actual DAD result. // // LOCK ORDERING: mu > dad.mu. dad struct { mu struct { sync.Mutex dad ip.DAD } } } // NICNameFromID is a function that returns a stable name for the specified NIC, // even if different NIC IDs are used to refer to the same NIC in different // program runs. It is used when generating opaque interface identifiers (IIDs). // If the NIC was created with a name, it is passed to NICNameFromID. // // NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are // generated for the same prefix on differnt NICs. type NICNameFromID func(tcpip.NICID, string) string // OpaqueInterfaceIdentifierOptions holds the options related to the generation // of opaque interface indentifiers (IIDs) as defined by RFC 7217. type OpaqueInterfaceIdentifierOptions struct { // NICNameFromID is a function that returns a stable name for a specified NIC, // even if the NIC ID changes over time. // // Must be specified to generate the opaque IID. NICNameFromID NICNameFromID // SecretKey is a pseudo-random number used as the secret key when generating // opaque IIDs as defined by RFC 7217. The key SHOULD be at least // header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness // requirements for security as outlined by RFC 4086. SecretKey MUST NOT // change between program runs, unless explicitly changed. // // OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey // MUST NOT be modified after Stack is created. // // May be nil, but a nil value is highly discouraged to maintain // some level of randomness between nodes. SecretKey []byte } // CheckDuplicateAddress implements stack.DuplicateAddressDetector. func (e *endpoint) CheckDuplicateAddress(addr tcpip.Address, h stack.DADCompletionHandler) stack.DADCheckAddressDisposition { e.dad.mu.Lock() defer e.dad.mu.Unlock() return e.dad.mu.dad.CheckDuplicateAddressLocked(addr, h) } // SetDADConfigurations implements stack.DuplicateAddressDetector. func (e *endpoint) SetDADConfigurations(c stack.DADConfigurations) { e.mu.Lock() defer e.mu.Unlock() e.dad.mu.Lock() defer e.dad.mu.Unlock() e.mu.ndp.dad.SetConfigsLocked(c) e.dad.mu.dad.SetConfigsLocked(c) } // DuplicateAddressProtocol implements stack.DuplicateAddressDetector. func (*endpoint) DuplicateAddressProtocol() tcpip.NetworkProtocolNumber { return ProtocolNumber } // HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint. func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) { // handleControl expects the entire offending packet to be in the packet // buffer's data field. pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()), }) pkt.NICID = e.nic.ID() pkt.NetworkProtocolNumber = ProtocolNumber e.handleControl(&icmpv6DestinationAddressUnreachableSockError{}, pkt) } // onAddressAssignedLocked handles an address being assigned. // // Precondition: e.mu must be exclusively locked. func (e *endpoint) onAddressAssignedLocked(addr tcpip.Address) { // As per RFC 2710 section 3, // // All MLD messages described in this document are sent with a link-local // IPv6 Source Address, ... // // If we just completed DAD for a link-local address, then attempt to send any // queued MLD reports. Note, we may have sent reports already for some of the // groups before we had a valid link-local address to use as the source for // the MLD messages, but that was only so that MLD snooping switches are aware // of our membership to groups - routers would not have handled those reports. // // As per RFC 3590 section 4, // // MLD Report and Done messages are sent with a link-local address as // the IPv6 source address, if a valid address is available on the // interface. If a valid link-local address is not available (e.g., one // has not been configured), the message is sent with the unspecified // address (::) as the IPv6 source address. // // Once a valid link-local address is available, a node SHOULD generate // new MLD Report messages for all multicast addresses joined on the // interface. // // Routers receiving an MLD Report or Done message with the unspecified // address as the IPv6 source address MUST silently discard the packet // without taking any action on the packets contents. // // Snooping switches MUST manage multicast forwarding state based on MLD // Report and Done messages sent with the unspecified address as the // IPv6 source address. if header.IsV6LinkLocalAddress(addr) { e.mu.mld.sendQueuedReports() } } // InvalidateDefaultRouter implements stack.NDPEndpoint. func (e *endpoint) InvalidateDefaultRouter(rtr tcpip.Address) { e.mu.Lock() defer e.mu.Unlock() e.mu.ndp.invalidateDefaultRouter(rtr) } // SetNDPConfigurations implements NDPEndpoint. func (e *endpoint) SetNDPConfigurations(c NDPConfigurations) { c.validate() e.mu.Lock() defer e.mu.Unlock() e.mu.ndp.configs = c } // hasTentativeAddr returns true if addr is tentative on e. func (e *endpoint) hasTentativeAddr(addr tcpip.Address) bool { e.mu.RLock() addressEndpoint := e.getAddressRLocked(addr) e.mu.RUnlock() return addressEndpoint != nil && addressEndpoint.GetKind() == stack.PermanentTentative } // dupTentativeAddrDetected attempts to inform e that a tentative addr is a // duplicate on a link. // // dupTentativeAddrDetected removes the tentative address if it exists. If the // address was generated via SLAAC, an attempt is made to generate a new // address. func (e *endpoint) dupTentativeAddrDetected(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() addressEndpoint := e.getAddressRLocked(addr) if addressEndpoint == nil { return &tcpip.ErrBadAddress{} } if addressEndpoint.GetKind() != stack.PermanentTentative { return &tcpip.ErrInvalidEndpointState{} } // If the address is a SLAAC address, do not invalidate its SLAAC prefix as an // attempt will be made to generate a new address for it. if err := e.removePermanentEndpointLocked(addressEndpoint, false /* allowSLAACInvalidation */, true /* dadFailure */); err != nil { return err } prefix := addressEndpoint.Subnet() switch t := addressEndpoint.ConfigType(); t { case stack.AddressConfigStatic: case stack.AddressConfigSlaac: e.mu.ndp.regenerateSLAACAddr(prefix) case stack.AddressConfigSlaacTemp: // Do not reset the generation attempts counter for the prefix as the // temporary address is being regenerated in response to a DAD conflict. e.mu.ndp.regenerateTempSLAACAddr(prefix, false /* resetGenAttempts */) default: panic(fmt.Sprintf("unrecognized address config type = %d", t)) } return nil } // transitionForwarding transitions the endpoint's forwarding status to // forwarding. // // Must only be called when the forwarding status changes. func (e *endpoint) transitionForwarding(forwarding bool) { e.mu.Lock() defer e.mu.Unlock() if !e.Enabled() { return } if forwarding { // When transitioning into an IPv6 router, host-only state (NDP discovered // routers, discovered on-link prefixes, and auto-generated addresses) is // cleaned up/invalidated and NDP router solicitations are stopped. e.mu.ndp.stopSolicitingRouters() e.mu.ndp.cleanupState(true /* hostOnly */) } else { // When transitioning into an IPv6 host, NDP router solicitations are // started. e.mu.ndp.startSolicitingRouters() } } // Enable implements stack.NetworkEndpoint. func (e *endpoint) Enable() tcpip.Error { e.mu.Lock() defer e.mu.Unlock() // If the NIC is not enabled, the endpoint can't do anything meaningful so // don't enable the endpoint. if !e.nic.Enabled() { return &tcpip.ErrNotPermitted{} } // If the endpoint is already enabled, there is nothing for it to do. if !e.setEnabled(true) { return nil } // Groups may have been joined when the endpoint was disabled, or the // endpoint may have left groups from the perspective of MLD when the // endpoint was disabled. Either way, we need to let routers know to // send us multicast traffic. e.mu.mld.initializeAll() // Join the IPv6 All-Nodes Multicast group if the stack is configured to // use IPv6. This is required to ensure that this node properly receives // and responds to the various NDP messages that are destined to the // all-nodes multicast address. An example is the Neighbor Advertisement // when we perform Duplicate Address Detection, or Router Advertisement // when we do Router Discovery. See RFC 4862, section 5.4.2 and RFC 4861 // section 4.2 for more information. // // Also auto-generate an IPv6 link-local address based on the endpoint's // link address if it is configured to do so. Note, each interface is // required to have IPv6 link-local unicast address, as per RFC 4291 // section 2.1. // Join the All-Nodes multicast group before starting DAD as responses to DAD // (NDP NS) messages may be sent to the All-Nodes multicast group if the // source address of the NDP NS is the unspecified address, as per RFC 4861 // section 7.2.4. if err := e.joinGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil { // joinGroupLocked only returns an error if the group address is not a valid // IPv6 multicast address. panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv6AllNodesMulticastAddress, err)) } // Perform DAD on the all the unicast IPv6 endpoints that are in the permanent // state. // // Addresses may have aleady completed DAD but in the time since the endpoint // was last enabled, other devices may have acquired the same addresses. var err tcpip.Error e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { addr := addressEndpoint.AddressWithPrefix().Address if !header.IsV6UnicastAddress(addr) { return true } switch addressEndpoint.GetKind() { case stack.Permanent: addressEndpoint.SetKind(stack.PermanentTentative) fallthrough case stack.PermanentTentative: err = e.mu.ndp.startDuplicateAddressDetection(addr, addressEndpoint) return err == nil default: return true } }) if err != nil { return err } // Do not auto-generate an IPv6 link-local address for loopback devices. if e.protocol.options.AutoGenLinkLocal && !e.nic.IsLoopback() { // The valid and preferred lifetime is infinite for the auto-generated // link-local address. e.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime) } // If we are operating as a router, then do not solicit routers since we // won't process the RAs anyway. // // Routers do not process Router Advertisements (RA) the same way a host // does. That is, routers do not learn from RAs (e.g. on-link prefixes // and default routers). Therefore, soliciting RAs from other routers on // a link is unnecessary for routers. if !e.protocol.Forwarding() { e.mu.ndp.startSolicitingRouters() } return nil } // Enabled implements stack.NetworkEndpoint. func (e *endpoint) Enabled() bool { return e.nic.Enabled() && e.isEnabled() } // isEnabled returns true if the endpoint is enabled, regardless of the // enabled status of the NIC. func (e *endpoint) isEnabled() bool { return atomic.LoadUint32(&e.enabled) == 1 } // setEnabled sets the enabled status for the endpoint. // // Returns true if the enabled status was updated. func (e *endpoint) setEnabled(v bool) bool { if v { return atomic.SwapUint32(&e.enabled, 1) == 0 } return atomic.SwapUint32(&e.enabled, 0) == 1 } // Disable implements stack.NetworkEndpoint. func (e *endpoint) Disable() { e.mu.Lock() defer e.mu.Unlock() e.disableLocked() } func (e *endpoint) disableLocked() { if !e.Enabled() { return } e.mu.ndp.stopSolicitingRouters() e.mu.ndp.cleanupState(false /* hostOnly */) e.stopDADForPermanentAddressesLocked() // The endpoint may have already left the multicast group. switch err := e.leaveGroupLocked(header.IPv6AllNodesMulticastAddress); err.(type) { case nil, *tcpip.ErrBadLocalAddress: default: panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv6AllNodesMulticastAddress, err)) } // Leave groups from the perspective of MLD so that routers know that // we are no longer interested in the group. e.mu.mld.softLeaveAll() if !e.setEnabled(false) { panic("should have only done work to disable the endpoint if it was enabled") } } // stopDADForPermanentAddressesLocked stops DAD for all permaneent addresses. // // Precondition: e.mu must be write locked. func (e *endpoint) stopDADForPermanentAddressesLocked() { // Stop DAD for all the tentative unicast addresses. e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { if addressEndpoint.GetKind() != stack.PermanentTentative { return true } addr := addressEndpoint.AddressWithPrefix().Address if header.IsV6UnicastAddress(addr) { e.mu.ndp.stopDuplicateAddressDetection(addr, false /* failed */) } return true }) } // DefaultTTL is the default hop limit for this endpoint. func (e *endpoint) DefaultTTL() uint8 { return e.protocol.DefaultTTL() } // MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus // the network layer max header length. func (e *endpoint) MTU() uint32 { networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv6MinimumSize) if err != nil { return 0 } return networkMTU } // MaxHeaderLength returns the maximum length needed by ipv6 headers (and // underlying protocols). func (e *endpoint) MaxHeaderLength() uint16 { // TODO(gvisor.dev/issues/5035): The maximum header length returned here does // not open the possibility for the caller to know about size required for // extension headers. return e.nic.MaxHeaderLength() + header.IPv6MinimumSize } func addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, extensionHeaders header.IPv6ExtHdrSerializer) tcpip.Error { extHdrsLen := extensionHeaders.Length() length := pkt.Size() + extensionHeaders.Length() if length > math.MaxUint16 { return &tcpip.ErrMessageTooLong{} } ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + extHdrsLen)) ip.Encode(&header.IPv6Fields{ PayloadLength: uint16(length), TransportProtocol: params.Protocol, HopLimit: params.TTL, TrafficClass: params.TOS, SrcAddr: srcAddr, DstAddr: dstAddr, ExtensionHeaders: extensionHeaders, }) pkt.NetworkProtocolNumber = ProtocolNumber return nil } func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32, gso *stack.GSO) bool { payload := pkt.TransportHeader().View().Size() + pkt.Data.Size() return (gso == nil || gso.Type == stack.GSONone) && uint32(payload) > networkMTU } // handleFragments fragments pkt and calls the handler function on each // fragment. It returns the number of fragments handled and the number of // fragments left to be processed. The IP header must already be present in the // original packet. The transport header protocol number is required to avoid // parsing the IPv6 extension headers. func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU uint32, pkt *stack.PacketBuffer, transProto tcpip.TransportProtocolNumber, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) { networkHeader := header.IPv6(pkt.NetworkHeader().View()) // TODO(gvisor.dev/issue/3912): Once the Authentication or ESP Headers are // supported for outbound packets, their length should not affect the fragment // maximum payload length because they should only be transmitted once. fragmentPayloadLen := (networkMTU - header.IPv6FragmentHeaderSize) &^ 7 if fragmentPayloadLen < header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit { // We need at least 8 bytes of space left for the fragmentable part because // the fragment payload must obviously be non-zero and must be a multiple // of 8 as per RFC 8200 section 4.5: // Each complete fragment, except possibly the last ("rightmost") one, is // an integer multiple of 8 octets long. return 0, 1, &tcpip.ErrMessageTooLong{} } if fragmentPayloadLen < uint32(pkt.TransportHeader().View().Size()) { // As per RFC 8200 Section 4.5, the Transport Header is expected to be small // enough to fit in the first fragment. return 0, 1, &tcpip.ErrMessageTooLong{} } pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadLen, calculateFragmentReserve(pkt)) id := atomic.AddUint32(&e.protocol.ids[hashRoute(r, e.protocol.hashIV)%buckets], 1) var n int for { fragPkt, more := buildNextFragment(&pf, networkHeader, transProto, id) if err := handler(fragPkt); err != nil { return n, pf.RemainingFragmentCount() + 1, err } n++ if !more { return n, pf.RemainingFragmentCount(), nil } } } // WritePacket writes a packet to the given destination address and protocol. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error { if err := addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* extensionHeaders */); err != nil { return err } // iptables filtering. All packets that reach here are locally // generated. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().Check(stack.Output, pkt, gso, r, "" /* preroutingAddr */, "" /* inNicName */, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesOutputDropped.Increment() return nil } // If the packet is manipulated as per NAT Output rules, handle packet // based on destination address and do not send the packet to link // layer. // // TODO(gvisor.dev/issue/170): We should do this for every // packet, rather than only NATted packets, but removing this check // short circuits broadcasts before they are sent out to other hosts. if pkt.NatDone { netHeader := header.IPv6(pkt.NetworkHeader().View()) if ep := e.protocol.findEndpointWithAddress(netHeader.DestinationAddress()); ep != nil { // Since we rewrote the packet but it is being routed back to us, we // can safely assume the checksum is valid. ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */) return nil } } return e.writePacket(r, gso, pkt, params.Protocol, false /* headerIncluded */) } func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer, protocol tcpip.TransportProtocolNumber, headerIncluded bool) tcpip.Error { if r.Loop&stack.PacketLoop != 0 { // If the packet was generated by the stack (not a raw/packet endpoint // where a packet may be written with the header included), then we can // safely assume the checksum is valid. e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */) } if r.Loop&stack.PacketOut == 0 { return nil } stats := e.stats.ip networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size())) if err != nil { stats.OutgoingPacketErrors.Increment() return err } if packetMustBeFragmented(pkt, networkMTU, gso) { sent, remain, err := e.handleFragments(r, gso, networkMTU, pkt, protocol, func(fragPkt *stack.PacketBuffer) tcpip.Error { // TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each // fragment one by one using WritePacket() (current strategy) or if we // want to create a PacketBufferList from the fragments and feed it to // WritePackets(). It'll be faster but cost more memory. return e.nic.WritePacket(r, gso, ProtocolNumber, fragPkt) }) stats.PacketsSent.IncrementBy(uint64(sent)) stats.OutgoingPacketErrors.IncrementBy(uint64(remain)) return err } if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil { stats.OutgoingPacketErrors.Increment() return err } stats.PacketsSent.Increment() return nil } // WritePackets implements stack.NetworkEndpoint.WritePackets. func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, tcpip.Error) { if r.Loop&stack.PacketLoop != 0 { panic("not implemented") } if r.Loop&stack.PacketOut == 0 { return pkts.Len(), nil } stats := e.stats.ip linkMTU := e.nic.MTU() for pb := pkts.Front(); pb != nil; pb = pb.Next() { if err := addIPHeader(r.LocalAddress, r.RemoteAddress, pb, params, nil /* extensionHeaders */); err != nil { return 0, err } networkMTU, err := calculateNetworkMTU(linkMTU, uint32(pb.NetworkHeader().View().Size())) if err != nil { stats.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len())) return 0, err } if packetMustBeFragmented(pb, networkMTU, gso) { // Keep track of the packet that is about to be fragmented so it can be // removed once the fragmentation is done. originalPkt := pb if _, _, err := e.handleFragments(r, gso, networkMTU, pb, params.Protocol, func(fragPkt *stack.PacketBuffer) tcpip.Error { // Modify the packet list in place with the new fragments. pkts.InsertAfter(pb, fragPkt) pb = fragPkt return nil }); err != nil { stats.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len())) return 0, err } // Remove the packet that was just fragmented and process the rest. pkts.Remove(originalPkt) } } // iptables filtering. All packets that reach here are locally // generated. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) dropped, natPkts := e.protocol.stack.IPTables().CheckPackets(stack.Output, pkts, gso, r, "" /* inNicName */, outNicName) stats.IPTablesOutputDropped.IncrementBy(uint64(len(dropped))) for pkt := range dropped { pkts.Remove(pkt) } // The NAT-ed packets may now be destined for us. locallyDelivered := 0 for pkt := range natPkts { ep := e.protocol.findEndpointWithAddress(header.IPv6(pkt.NetworkHeader().View()).DestinationAddress()) if ep == nil { // The NAT-ed packet is still destined for some remote node. continue } // Do not send the locally destined packet out the NIC. pkts.Remove(pkt) // Deliver the packet locally. ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */) locallyDelivered++ } // The rest of the packets can be delivered to the NIC as a batch. pktsLen := pkts.Len() written, err := e.nic.WritePackets(r, gso, pkts, ProtocolNumber) stats.PacketsSent.IncrementBy(uint64(written)) stats.OutgoingPacketErrors.IncrementBy(uint64(pktsLen - written)) // Dropped packets aren't errors, so include them in the return value. return locallyDelivered + written + len(dropped), err } // WriteHeaderIncludedPacket implements stack.NetworkEndpoint. func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error { // The packet already has an IP header, but there are a few required checks. h, ok := pkt.Data.PullUp(header.IPv6MinimumSize) if !ok { return &tcpip.ErrMalformedHeader{} } ip := header.IPv6(h) // Always set the payload length. pktSize := pkt.Data.Size() ip.SetPayloadLength(uint16(pktSize - header.IPv6MinimumSize)) // Set the source address when zero. if ip.SourceAddress() == header.IPv6Any { ip.SetSourceAddress(r.LocalAddress) } // Set the destination. If the packet already included a destination, it will // be part of the route anyways. ip.SetDestinationAddress(r.RemoteAddress) // Populate the packet buffer's network header and don't allow an invalid // packet to be sent. // // Note that parsing only makes sure that the packet is well formed as per the // wire format. We also want to check if the header's fields are valid before // sending the packet. proto, _, _, _, ok := parse.IPv6(pkt) if !ok || !header.IPv6(pkt.NetworkHeader().View()).IsValid(pktSize) { return &tcpip.ErrMalformedHeader{} } return e.writePacket(r, nil /* gso */, pkt, proto, true /* headerIncluded */) } // forwardPacket attempts to forward a packet to its final destination. func (e *endpoint) forwardPacket(pkt *stack.PacketBuffer) tcpip.Error { h := header.IPv6(pkt.NetworkHeader().View()) hopLimit := h.HopLimit() if hopLimit <= 1 { // As per RFC 4443 section 3.3, // // If a router receives a packet with a Hop Limit of zero, or if a // router decrements a packet's Hop Limit to zero, it MUST discard the // packet and originate an ICMPv6 Time Exceeded message with Code 0 to // the source of the packet. This indicates either a routing loop or // too small an initial Hop Limit value. return e.protocol.returnError(&icmpReasonHopLimitExceeded{}, pkt) } dstAddr := h.DestinationAddress() // Check if the destination is owned by the stack. if ep := e.protocol.findEndpointWithAddress(dstAddr); ep != nil { ep.handlePacket(pkt) return nil } r, err := e.protocol.stack.FindRoute(0, "", dstAddr, ProtocolNumber, false /* multicastLoop */) if err != nil { return err } defer r.Release() // We need to do a deep copy of the IP packet because // WriteHeaderIncludedPacket takes ownership of the packet buffer, but we do // not own it. newHdr := header.IPv6(stack.PayloadSince(pkt.NetworkHeader())) // As per RFC 8200 section 3, // // Hop Limit 8-bit unsigned integer. Decremented by 1 by // each node that forwards the packet. newHdr.SetHopLimit(hopLimit - 1) return r.WriteHeaderIncludedPacket(stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()), Data: buffer.View(newHdr).ToVectorisedView(), })) } // HandlePacket is called by the link layer when new ipv6 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { stats := e.stats.ip stats.PacketsReceived.Increment() if !e.isEnabled() { stats.DisabledPacketsReceived.Increment() return } if !e.protocol.parse(pkt) { stats.MalformedPacketsReceived.Increment() return } if !e.nic.IsLoopback() { if e.protocol.stack.HandleLocal() { addressEndpoint := e.AcquireAssignedAddress(header.IPv6(pkt.NetworkHeader().View()).SourceAddress(), e.nic.Promiscuous(), stack.CanBePrimaryEndpoint) if addressEndpoint != nil { addressEndpoint.DecRef() // The source address is one of our own, so we never should have gotten // a packet like this unless HandleLocal is false or our NIC is the // loopback interface. stats.InvalidSourceAddressesReceived.Increment() return } } // Loopback traffic skips the prerouting chain. inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().Check(stack.Prerouting, pkt, nil, nil, e.MainAddress().Address, inNicName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. stats.IPTablesPreroutingDropped.Increment() return } } e.handlePacket(pkt) } func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) { stats := e.stats.ip stats.PacketsReceived.Increment() pkt = pkt.CloneToInbound() if e.protocol.parse(pkt) { pkt.RXTransportChecksumValidated = canSkipRXChecksum e.handlePacket(pkt) return } stats.MalformedPacketsReceived.Increment() } // handlePacket is like HandlePacket except it does not perform the prerouting // iptables hook. func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { pkt.NICID = e.nic.ID() stats := e.stats.ip h := header.IPv6(pkt.NetworkHeader().View()) if !h.IsValid(pkt.Data.Size() + pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size()) { stats.MalformedPacketsReceived.Increment() return } srcAddr := h.SourceAddress() dstAddr := h.DestinationAddress() // As per RFC 4291 section 2.7: // Multicast addresses must not be used as source addresses in IPv6 // packets or appear in any Routing header. if header.IsV6MulticastAddress(srcAddr) { stats.InvalidSourceAddressesReceived.Increment() return } // The destination address should be an address we own or a group we joined // for us to receive the packet. Otherwise, attempt to forward the packet. if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil { addressEndpoint.DecRef() } else if !e.IsInGroup(dstAddr) { if !e.protocol.Forwarding() { stats.InvalidDestinationAddressesReceived.Increment() return } _ = e.forwardPacket(pkt) return } // vv consists of: // - Any IPv6 header bytes after the first 40 (i.e. extensions). // - The transport header, if present. // - Any other payload data. vv := pkt.NetworkHeader().View()[header.IPv6MinimumSize:].ToVectorisedView() vv.AppendView(pkt.TransportHeader().View()) vv.Append(pkt.Data) it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv) hasFragmentHeader := false // iptables filtering. All packets that reach here are intended for // this machine and need not be forwarded. inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().Check(stack.Input, pkt, nil, nil, "" /* preroutingAddr */, inNicName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. stats.IPTablesInputDropped.Increment() return } for { // Keep track of the start of the previous header so we can report the // special case of a Hop by Hop at a location other than at the start. previousHeaderStart := it.HeaderOffset() extHdr, done, err := it.Next() if err != nil { stats.MalformedPacketsReceived.Increment() return } if done { break } switch extHdr := extHdr.(type) { case header.IPv6HopByHopOptionsExtHdr: // As per RFC 8200 section 4.1, the Hop By Hop extension header is // restricted to appear immediately after an IPv6 fixed header. if previousHeaderStart != 0 { _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownHeader, pointer: previousHeaderStart, }, pkt) return } optsIt := extHdr.Iter() for { opt, done, err := optsIt.Next() if err != nil { stats.MalformedPacketsReceived.Increment() return } if done { break } // We currently do not support any IPv6 Hop By Hop extension header // options. switch opt.UnknownAction() { case header.IPv6OptionUnknownActionSkip: case header.IPv6OptionUnknownActionDiscard: return case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest: if header.IsV6MulticastAddress(dstAddr) { return } fallthrough case header.IPv6OptionUnknownActionDiscardSendICMP: // This case satisfies a requirement of RFC 8200 section 4.2 // which states that an unknown option starting with bits [10] should: // // discard the packet and, regardless of whether or not the // packet's Destination Address was a multicast address, send an // ICMP Parameter Problem, Code 2, message to the packet's // Source Address, pointing to the unrecognized Option Type. // _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownOption, pointer: it.ParseOffset() + optsIt.OptionOffset(), respondToMulticast: true, }, pkt) return default: panic(fmt.Sprintf("unrecognized action for an unrecognized Hop By Hop extension header option = %d", opt)) } } case header.IPv6RoutingExtHdr: // As per RFC 8200 section 4.4, if a node encounters a routing header with // an unrecognized routing type value, with a non-zero Segments Left // value, the node must discard the packet and send an ICMP Parameter // Problem, Code 0 to the packet's Source Address, pointing to the // unrecognized Routing Type. // // If the Segments Left is 0, the node must ignore the Routing extension // header and process the next header in the packet. // // Note, the stack does not yet handle any type of routing extension // header, so we just make sure Segments Left is zero before processing // the next extension header. if extHdr.SegmentsLeft() != 0 { _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6ErroneousHeader, pointer: it.ParseOffset(), }, pkt) return } case header.IPv6FragmentExtHdr: hasFragmentHeader = true if extHdr.IsAtomic() { // This fragment extension header indicates that this packet is an // atomic fragment. An atomic fragment is a fragment that contains // all the data required to reassemble a full packet. As per RFC 6946, // atomic fragments must not interfere with "normal" fragmented traffic // so we skip processing the fragment instead of feeding it through the // reassembly process below. continue } fragmentFieldOffset := it.ParseOffset() // Don't consume the iterator if we have the first fragment because we // will use it to validate that the first fragment holds the upper layer // header. rawPayload := it.AsRawHeader(extHdr.FragmentOffset() != 0 /* consume */) if extHdr.FragmentOffset() == 0 { // Check that the iterator ends with a raw payload as the first fragment // should include all headers up to and including any upper layer // headers, as per RFC 8200 section 4.5; only upper layer data // (non-headers) should follow the fragment extension header. var lastHdr header.IPv6PayloadHeader for { it, done, err := it.Next() if err != nil { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return } if done { break } lastHdr = it } // If the last header is a raw header, then the last portion of the IPv6 // payload is not a known IPv6 extension header. Note, this does not // mean that the last portion is an upper layer header or not an // extension header because: // 1) we do not yet support all extension headers // 2) we do not validate the upper layer header before reassembling. // // This check makes sure that a known IPv6 extension header is not // present after the Fragment extension header in a non-initial // fragment. // // TODO(#2196): Support IPv6 Authentication and Encapsulated // Security Payload extension headers. // TODO(#2333): Validate that the upper layer header is valid. switch lastHdr.(type) { case header.IPv6RawPayloadHeader: default: stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return } } fragmentPayloadLen := rawPayload.Buf.Size() if fragmentPayloadLen == 0 { // Drop the packet as it's marked as a fragment but has no payload. stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return } // As per RFC 2460 Section 4.5: // // If the length of a fragment, as derived from the fragment packet's // Payload Length field, is not a multiple of 8 octets and the M flag // of that fragment is 1, then that fragment must be discarded and an // ICMP Parameter Problem, Code 0, message should be sent to the source // of the fragment, pointing to the Payload Length field of the // fragment packet. if extHdr.More() && fragmentPayloadLen%header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit != 0 { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6ErroneousHeader, pointer: header.IPv6PayloadLenOffset, }, pkt) return } // The packet is a fragment, let's try to reassemble it. start := extHdr.FragmentOffset() * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit // As per RFC 2460 Section 4.5: // // If the length and offset of a fragment are such that the Payload // Length of the packet reassembled from that fragment would exceed // 65,535 octets, then that fragment must be discarded and an ICMP // Parameter Problem, Code 0, message should be sent to the source of // the fragment, pointing to the Fragment Offset field of the fragment // packet. if int(start)+fragmentPayloadLen > header.IPv6MaximumPayloadSize { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6ErroneousHeader, pointer: fragmentFieldOffset, }, pkt) return } // Note that pkt doesn't have its transport header set after reassembly, // and won't until DeliverNetworkPacket sets it. resPkt, proto, ready, err := e.protocol.fragmentation.Process( // IPv6 ignores the Protocol field since the ID only needs to be unique // across source-destination pairs, as per RFC 8200 section 4.5. fragmentation.FragmentID{ Source: srcAddr, Destination: dstAddr, ID: extHdr.ID(), }, start, start+uint16(fragmentPayloadLen)-1, extHdr.More(), uint8(rawPayload.Identifier), pkt, ) if err != nil { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return } if ready { pkt = resPkt // We create a new iterator with the reassembled packet because we could // have more extension headers in the reassembled payload, as per RFC // 8200 section 4.5. We also use the NextHeader value from the first // fragment. it = header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(proto), pkt.Data) } case header.IPv6DestinationOptionsExtHdr: optsIt := extHdr.Iter() for { opt, done, err := optsIt.Next() if err != nil { stats.MalformedPacketsReceived.Increment() return } if done { break } // We currently do not support any IPv6 Destination extension header // options. switch opt.UnknownAction() { case header.IPv6OptionUnknownActionSkip: case header.IPv6OptionUnknownActionDiscard: return case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest: if header.IsV6MulticastAddress(dstAddr) { return } fallthrough case header.IPv6OptionUnknownActionDiscardSendICMP: // This case satisfies a requirement of RFC 8200 section 4.2 // which states that an unknown option starting with bits [10] should: // // discard the packet and, regardless of whether or not the // packet's Destination Address was a multicast address, send an // ICMP Parameter Problem, Code 2, message to the packet's // Source Address, pointing to the unrecognized Option Type. // _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownOption, pointer: it.ParseOffset() + optsIt.OptionOffset(), respondToMulticast: true, }, pkt) return default: panic(fmt.Sprintf("unrecognized action for an unrecognized Destination extension header option = %d", opt)) } } case header.IPv6RawPayloadHeader: // If the last header in the payload isn't a known IPv6 extension header, // handle it as if it is transport layer data. // For unfragmented packets, extHdr still contains the transport header. // Get rid of it. // // For reassembled fragments, pkt.TransportHeader is unset, so this is a // no-op and pkt.Data begins with the transport header. extHdr.Buf.TrimFront(pkt.TransportHeader().View().Size()) pkt.Data = extHdr.Buf stats.PacketsDelivered.Increment() if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber { pkt.TransportProtocolNumber = p e.handleICMP(pkt, hasFragmentHeader) } else { stats.PacketsDelivered.Increment() switch res := e.dispatcher.DeliverTransportPacket(p, pkt); res { case stack.TransportPacketHandled: case stack.TransportPacketDestinationPortUnreachable: // As per RFC 4443 section 3.1: // A destination node SHOULD originate a Destination Unreachable // message with Code 4 in response to a packet for which the // transport protocol (e.g., UDP) has no listener, if that transport // protocol has no alternative means to inform the sender. _ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt) case stack.TransportPacketProtocolUnreachable: // As per RFC 8200 section 4. (page 7): // Extension headers are numbered from IANA IP Protocol Numbers // [IANA-PN], the same values used for IPv4 and IPv6. When // processing a sequence of Next Header values in a packet, the // first one that is not an extension header [IANA-EH] indicates // that the next item in the packet is the corresponding upper-layer // header. // With more related information on page 8: // If, as a result of processing a header, the destination node is // required to proceed to the next header but the Next Header value // in the current header is unrecognized by the node, it should // discard the packet and send an ICMP Parameter Problem message to // the source of the packet, with an ICMP Code value of 1 // ("unrecognized Next Header type encountered") and the ICMP // Pointer field containing the offset of the unrecognized value // within the original packet. // // Which when taken together indicate that an unknown protocol should // be treated as an unrecognized next header value. // The location of the Next Header field is in a different place in // the initial IPv6 header than it is in the extension headers so // treat it specially. prevHdrIDOffset := uint32(header.IPv6NextHeaderOffset) if previousHeaderStart != 0 { prevHdrIDOffset = previousHeaderStart } _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownHeader, pointer: prevHdrIDOffset, }, pkt) default: panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res)) } } default: // Since the iterator returns IPv6RawPayloadHeader for unknown Extension // Header IDs this should never happen unless we missed a supported type // here. panic(fmt.Sprintf("unrecognized type from it.Next() = %T", extHdr)) } } } // Close cleans up resources associated with the endpoint. func (e *endpoint) Close() { e.mu.Lock() e.disableLocked() e.mu.ndp.removeSLAACAddresses(false /* keepLinkLocal */) e.stopDADForPermanentAddressesLocked() e.mu.addressableEndpointState.Cleanup() e.mu.Unlock() e.protocol.forgetEndpoint(e.nic.ID()) } // NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber. func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { return e.protocol.Number() } // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint. func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) { // TODO(b/169350103): add checks here after making sure we no longer receive // an empty address. e.mu.Lock() defer e.mu.Unlock() return e.addAndAcquirePermanentAddressLocked(addr, peb, configType, deprecated) } // addAndAcquirePermanentAddressLocked is like AddAndAcquirePermanentAddress but // with locking requirements. // // addAndAcquirePermanentAddressLocked also joins the passed address's // solicited-node multicast group and start duplicate address detection. // // Precondition: e.mu must be write locked. func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) { addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated) if err != nil { return nil, err } if !header.IsV6UnicastAddress(addr.Address) { return addressEndpoint, nil } addressEndpoint.SetKind(stack.PermanentTentative) if e.Enabled() { if err := e.mu.ndp.startDuplicateAddressDetection(addr.Address, addressEndpoint); err != nil { return nil, err } } snmc := header.SolicitedNodeAddr(addr.Address) if err := e.joinGroupLocked(snmc); err != nil { // joinGroupLocked only returns an error if the group address is not a valid // IPv6 multicast address. panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", snmc, err)) } return addressEndpoint, nil } // RemovePermanentAddress implements stack.AddressableEndpoint. func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() addressEndpoint := e.getAddressRLocked(addr) if addressEndpoint == nil || !addressEndpoint.GetKind().IsPermanent() { return &tcpip.ErrBadLocalAddress{} } return e.removePermanentEndpointLocked(addressEndpoint, true /* allowSLAACInvalidation */, false /* dadFailure */) } // removePermanentEndpointLocked is like removePermanentAddressLocked except // it works with a stack.AddressEndpoint. // // Precondition: e.mu must be write locked. func (e *endpoint) removePermanentEndpointLocked(addressEndpoint stack.AddressEndpoint, allowSLAACInvalidation, dadFailure bool) tcpip.Error { addr := addressEndpoint.AddressWithPrefix() // If we are removing an address generated via SLAAC, cleanup // its SLAAC resources and notify the integrator. switch addressEndpoint.ConfigType() { case stack.AddressConfigSlaac: e.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation) case stack.AddressConfigSlaacTemp: e.mu.ndp.cleanupTempSLAACAddrResourcesAndNotify(addr) } return e.removePermanentEndpointInnerLocked(addressEndpoint, dadFailure) } // removePermanentEndpointInnerLocked is like removePermanentEndpointLocked // except it does not cleanup SLAAC address state. // // Precondition: e.mu must be write locked. func (e *endpoint) removePermanentEndpointInnerLocked(addressEndpoint stack.AddressEndpoint, dadFailure bool) tcpip.Error { addr := addressEndpoint.AddressWithPrefix() e.mu.ndp.stopDuplicateAddressDetection(addr.Address, dadFailure) if err := e.mu.addressableEndpointState.RemovePermanentEndpoint(addressEndpoint); err != nil { return err } snmc := header.SolicitedNodeAddr(addr.Address) err := e.leaveGroupLocked(snmc) // The endpoint may have already left the multicast group. if _, ok := err.(*tcpip.ErrBadLocalAddress); ok { err = nil } return err } // hasPermanentAddressLocked returns true if the endpoint has a permanent // address equal to the passed address. // // Precondition: e.mu must be read or write locked. func (e *endpoint) hasPermanentAddressRLocked(addr tcpip.Address) bool { addressEndpoint := e.getAddressRLocked(addr) if addressEndpoint == nil { return false } return addressEndpoint.GetKind().IsPermanent() } // getAddressRLocked returns the endpoint for the passed address. // // Precondition: e.mu must be read or write locked. func (e *endpoint) getAddressRLocked(localAddr tcpip.Address) stack.AddressEndpoint { return e.mu.addressableEndpointState.GetAddress(localAddr) } // MainAddress implements stack.AddressableEndpoint. func (e *endpoint) MainAddress() tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.MainAddress() } // AcquireAssignedAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint { e.mu.Lock() defer e.mu.Unlock() return e.acquireAddressOrCreateTempLocked(localAddr, allowTemp, tempPEB) } // acquireAddressOrCreateTempLocked is like AcquireAssignedAddress but with // locking requirements. // // Precondition: e.mu must be write locked. func (e *endpoint) acquireAddressOrCreateTempLocked(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint { return e.mu.addressableEndpointState.AcquireAssignedAddress(localAddr, allowTemp, tempPEB) } // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired) } // getLinkLocalAddressRLocked returns a link-local address from the primary list // of addresses, if one is available. // // See stack.PrimaryEndpointBehavior for more details about the primary list. // // Precondition: e.mu must be read locked. func (e *endpoint) getLinkLocalAddressRLocked() tcpip.Address { var linkLocalAddr tcpip.Address e.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { if addressEndpoint.IsAssigned(false /* allowExpired */) { if addr := addressEndpoint.AddressWithPrefix().Address; header.IsV6LinkLocalAddress(addr) { linkLocalAddr = addr return false } } return true }) return linkLocalAddr } // acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress // but with locking requirements. // // Precondition: e.mu must be read locked. func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { // addrCandidate is a candidate for Source Address Selection, as per // RFC 6724 section 5. type addrCandidate struct { addressEndpoint stack.AddressEndpoint addr tcpip.Address scope header.IPv6AddressScope label uint8 matchingPrefix uint8 } if len(remoteAddr) == 0 { return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired) } // Create a candidate set of available addresses we can potentially use as a // source address. var cs []addrCandidate e.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { // If r is not valid for outgoing connections, it is not a valid endpoint. if !addressEndpoint.IsAssigned(allowExpired) { return true } addr := addressEndpoint.AddressWithPrefix().Address scope, err := header.ScopeForIPv6Address(addr) if err != nil { // Should never happen as we got r from the primary IPv6 endpoint list and // ScopeForIPv6Address only returns an error if addr is not an IPv6 // address. panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err)) } cs = append(cs, addrCandidate{ addressEndpoint: addressEndpoint, addr: addr, scope: scope, label: getLabel(addr), matchingPrefix: remoteAddr.MatchingPrefix(addr), }) return true }) remoteScope, err := header.ScopeForIPv6Address(remoteAddr) if err != nil { // primaryIPv6Endpoint should never be called with an invalid IPv6 address. panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err)) } remoteLabel := getLabel(remoteAddr) // Sort the addresses as per RFC 6724 section 5 rules 1-3. // // TODO(b/146021396): Implement rules 4, 5 of RFC 6724 section 5. sort.Slice(cs, func(i, j int) bool { sa := cs[i] sb := cs[j] // Prefer same address as per RFC 6724 section 5 rule 1. if sa.addr == remoteAddr { return true } if sb.addr == remoteAddr { return false } // Prefer appropriate scope as per RFC 6724 section 5 rule 2. if sa.scope < sb.scope { return sa.scope >= remoteScope } else if sb.scope < sa.scope { return sb.scope < remoteScope } // Avoid deprecated addresses as per RFC 6724 section 5 rule 3. if saDep, sbDep := sa.addressEndpoint.Deprecated(), sb.addressEndpoint.Deprecated(); saDep != sbDep { // If sa is not deprecated, it is preferred over sb. return sbDep } // Prefer matching label as per RFC 6724 section 5 rule 6. if sa, sb := sa.label == remoteLabel, sb.label == remoteLabel; sa != sb { if sa { return true } if sb { return false } } // Prefer temporary addresses as per RFC 6724 section 5 rule 7. if saTemp, sbTemp := sa.addressEndpoint.ConfigType() == stack.AddressConfigSlaacTemp, sb.addressEndpoint.ConfigType() == stack.AddressConfigSlaacTemp; saTemp != sbTemp { return saTemp } // Use longest matching prefix as per RFC 6724 section 5 rule 8. if sa.matchingPrefix > sb.matchingPrefix { return true } if sb.matchingPrefix > sa.matchingPrefix { return false } // sa and sb are equal, return the endpoint that is closest to the front of // the primary endpoint list. return i < j }) // Return the most preferred address that can have its reference count // incremented. for _, c := range cs { if c.addressEndpoint.IncRef() { return c.addressEndpoint } } return nil } // PrimaryAddresses implements stack.AddressableEndpoint. func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.PrimaryAddresses() } // PermanentAddresses implements stack.AddressableEndpoint. func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.PermanentAddresses() } // JoinGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.joinGroupLocked(addr) } // joinGroupLocked is like JoinGroup but with locking requirements. // // Precondition: e.mu must be locked. func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error { if !header.IsV6MulticastAddress(addr) { return &tcpip.ErrBadAddress{} } e.mu.mld.joinGroup(addr) return nil } // LeaveGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.leaveGroupLocked(addr) } // leaveGroupLocked is like LeaveGroup but with locking requirements. // // Precondition: e.mu must be locked. func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error { return e.mu.mld.leaveGroup(addr) } // IsInGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) IsInGroup(addr tcpip.Address) bool { e.mu.RLock() defer e.mu.RUnlock() return e.mu.mld.isInGroup(addr) } // Stats implements stack.NetworkEndpoint. func (e *endpoint) Stats() stack.NetworkEndpointStats { return &e.stats.localStats } var _ stack.ForwardingNetworkProtocol = (*protocol)(nil) var _ stack.NetworkProtocol = (*protocol)(nil) var _ fragmentation.TimeoutHandler = (*protocol)(nil) type protocol struct { stack *stack.Stack options Options mu struct { sync.RWMutex // eps is keyed by NICID to allow protocol methods to retrieve an endpoint // when handling a packet, by looking at which NIC handled the packet. eps map[tcpip.NICID]*endpoint } ids []uint32 hashIV uint32 // defaultTTL is the current default TTL for the protocol. Only the // uint8 portion of it is meaningful. // // Must be accessed using atomic operations. defaultTTL uint32 // forwarding is set to 1 when the protocol has forwarding enabled and 0 // when it is disabled. // // Must be accessed using atomic operations. forwarding uint32 fragmentation *fragmentation.Fragmentation } // Number returns the ipv6 protocol number. func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } // MinimumPacketSize returns the minimum valid ipv6 packet size. func (p *protocol) MinimumPacketSize() int { return header.IPv6MinimumSize } // DefaultPrefixLen returns the IPv6 default prefix length. func (p *protocol) DefaultPrefixLen() int { return header.IPv6AddressSize * 8 } // ParseAddresses implements NetworkProtocol.ParseAddresses. func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { h := header.IPv6(v) return h.SourceAddress(), h.DestinationAddress() } // NewEndpoint creates a new ipv6 endpoint. func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint { e := &endpoint{ nic: nic, dispatcher: dispatcher, protocol: p, } e.mu.Lock() e.mu.addressableEndpointState.Init(e) e.mu.ndp.init(e) e.mu.mld.init(e) e.dad.mu.Lock() e.dad.mu.dad.Init(&e.dad.mu, p.options.DADConfigs, ip.DADOptions{ Clock: p.stack.Clock(), Protocol: &e.mu.ndp, NICID: nic.ID(), }) e.dad.mu.Unlock() e.mu.Unlock() stackStats := p.stack.Stats() tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem()) e.stats.ip.Init(&e.stats.localStats.IP, &stackStats.IP) e.stats.icmp.init(&e.stats.localStats.ICMP, &stackStats.ICMP.V6) p.mu.Lock() defer p.mu.Unlock() p.mu.eps[nic.ID()] = e return e } func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint { p.mu.RLock() defer p.mu.RUnlock() for _, e := range p.mu.eps { if addressEndpoint := e.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil { addressEndpoint.DecRef() return e } } return nil } func (p *protocol) forgetEndpoint(nicID tcpip.NICID) { p.mu.Lock() defer p.mu.Unlock() delete(p.mu.eps, nicID) } // SetOption implements NetworkProtocol.SetOption. func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: p.SetDefaultTTL(uint8(*v)) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Option implements NetworkProtocol.Option. func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: *v = tcpip.DefaultTTLOption(p.DefaultTTL()) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // SetDefaultTTL sets the default TTL for endpoints created with this protocol. func (p *protocol) SetDefaultTTL(ttl uint8) { atomic.StoreUint32(&p.defaultTTL, uint32(ttl)) } // DefaultTTL returns the default TTL for endpoints created with this protocol. func (p *protocol) DefaultTTL() uint8 { return uint8(atomic.LoadUint32(&p.defaultTTL)) } // Close implements stack.TransportProtocol.Close. func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} // parse is like Parse but also attempts to parse the transport layer. // // Returns true if the network header was successfully parsed. func (p *protocol) parse(pkt *stack.PacketBuffer) bool { transProtoNum, hasTransportHdr, ok := p.Parse(pkt) if !ok { return false } if hasTransportHdr { switch err := p.stack.ParsePacketBufferTransport(transProtoNum, pkt); err { case stack.ParsedOK: case stack.UnknownTransportProtocol, stack.TransportLayerParseError: // The transport layer will handle unknown protocols and transport layer // parsing errors. default: panic(fmt.Sprintf("unexpected error parsing transport header = %d", err)) } } return true } // Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { proto, _, fragOffset, fragMore, ok := parse.IPv6(pkt) if !ok { return 0, false, false } return proto, !fragMore && fragOffset == 0, true } // Forwarding implements stack.ForwardingNetworkProtocol. func (p *protocol) Forwarding() bool { return uint8(atomic.LoadUint32(&p.forwarding)) == 1 } // setForwarding sets the forwarding status for the protocol. // // Returns true if the forwarding status was updated. func (p *protocol) setForwarding(v bool) bool { if v { return atomic.SwapUint32(&p.forwarding, 1) == 0 } return atomic.SwapUint32(&p.forwarding, 0) == 1 } // SetForwarding implements stack.ForwardingNetworkProtocol. func (p *protocol) SetForwarding(v bool) { p.mu.Lock() defer p.mu.Unlock() if !p.setForwarding(v) { return } for _, ep := range p.mu.eps { ep.transitionForwarding(v) } } // calculateNetworkMTU calculates the network-layer payload MTU based on the // link-layer payload MTU and the length of every IPv6 header. // Note that this is different than the Payload Length field of the IPv6 header, // which includes the length of the extension headers. func calculateNetworkMTU(linkMTU, networkHeadersLen uint32) (uint32, tcpip.Error) { if linkMTU < header.IPv6MinimumMTU { return 0, &tcpip.ErrInvalidEndpointState{} } // As per RFC 7112 section 5, we should discard packets if their IPv6 header // is bigger than 1280 bytes (ie, the minimum link MTU) since we do not // support PMTU discovery: // Hosts that do not discover the Path MTU MUST limit the IPv6 Header Chain // length to 1280 bytes. Limiting the IPv6 Header Chain length to 1280 // bytes ensures that the header chain length does not exceed the IPv6 // minimum MTU. if networkHeadersLen > header.IPv6MinimumMTU { return 0, &tcpip.ErrMalformedHeader{} } networkMTU := linkMTU - uint32(networkHeadersLen) if networkMTU > maxPayloadSize { networkMTU = maxPayloadSize } return networkMTU, nil } // Options holds options to configure a new protocol. type Options struct { // NDPConfigs is the default NDP configurations used by interfaces. NDPConfigs NDPConfigurations // AutoGenLinkLocal determines whether or not the stack attempts to // auto-generate a link-local address for newly enabled non-loopback // NICs. // // Note, setting this to true does not mean that a link-local address is // assigned right away, or at all. If Duplicate Address Detection is enabled, // an address is only assigned if it successfully resolves. If it fails, no // further attempts are made to auto-generate a link-local adddress. // // The generated link-local address follows RFC 4291 Appendix A guidelines. AutoGenLinkLocal bool // NDPDisp is the NDP event dispatcher that an integrator can provide to // receive NDP related events. NDPDisp NDPDispatcher // OpaqueIIDOpts hold the options for generating opaque interface // identifiers (IIDs) as outlined by RFC 7217. OpaqueIIDOpts OpaqueInterfaceIdentifierOptions // TempIIDSeed is used to seed the initial temporary interface identifier // history value used to generate IIDs for temporary SLAAC addresses. // // Temporary SLAAC adresses are short-lived addresses which are unpredictable // and random from the perspective of other nodes on the network. It is // recommended that the seed be a random byte buffer of at least // header.IIDSize bytes to make sure that temporary SLAAC addresses are // sufficiently random. It should follow minimum randomness requirements for // security as outlined by RFC 4086. // // Note: using a nil value, the same seed across netstack program runs, or a // seed that is too small would reduce randomness and increase predictability, // defeating the purpose of temporary SLAAC addresses. TempIIDSeed []byte // MLD holds options for MLD. MLD MLDOptions // DADConfigs holds the default DAD configurations used by IPv6 endpoints. DADConfigs stack.DADConfigurations } // NewProtocolWithOptions returns an IPv6 network protocol. func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { opts.NDPConfigs.validate() ids := hash.RandN32(buckets) hashIV := hash.RandN32(1)[0] return func(s *stack.Stack) stack.NetworkProtocol { p := &protocol{ stack: s, options: opts, ids: ids, hashIV: hashIV, } p.fragmentation = fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p) p.mu.eps = make(map[tcpip.NICID]*endpoint) p.SetDefaultTTL(DefaultTTL) return p } } // NewProtocol is equivalent to NewProtocolWithOptions with an empty Options. func NewProtocol(s *stack.Stack) stack.NetworkProtocol { return NewProtocolWithOptions(Options{})(s) } func calculateFragmentReserve(pkt *stack.PacketBuffer) int { return pkt.AvailableHeaderBytes() + pkt.NetworkHeader().View().Size() + header.IPv6FragmentHeaderSize } // hashRoute calculates a hash value for the given route. It uses the source & // destination address and 32-bit number to generate the hash. func hashRoute(r *stack.Route, hashIV uint32) uint32 { // The FNV-1a was chosen because it is a fast hashing algorithm, and // cryptographic properties are not needed here. h := fnv.New32a() if _, err := h.Write([]byte(r.LocalAddress)); err != nil { panic(fmt.Sprintf("Hash.Write: %s, but Hash' implementation of Write is not expected to ever return an error", err)) } if _, err := h.Write([]byte(r.RemoteAddress)); err != nil { panic(fmt.Sprintf("Hash.Write: %s, but Hash' implementation of Write is not expected to ever return an error", err)) } s := make([]byte, 4) binary.LittleEndian.PutUint32(s, hashIV) if _, err := h.Write(s); err != nil { panic(fmt.Sprintf("Hash.Write: %s, but Hash' implementation of Write is not expected ever to return an error", err)) } return h.Sum32() } func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeaders header.IPv6, transportProto tcpip.TransportProtocolNumber, id uint32) (*stack.PacketBuffer, bool) { fragPkt, offset, copied, more := pf.BuildNextFragment() fragPkt.NetworkProtocolNumber = ProtocolNumber originalIPHeadersLength := len(originalIPHeaders) s := header.IPv6ExtHdrSerializer{&header.IPv6SerializableFragmentExtHdr{ FragmentOffset: uint16(offset / header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit), M: more, Identification: id, }} fragmentIPHeadersLength := originalIPHeadersLength + s.Length() fragmentIPHeaders := header.IPv6(fragPkt.NetworkHeader().Push(fragmentIPHeadersLength)) // Copy the IPv6 header and any extension headers already populated. if copied := copy(fragmentIPHeaders, originalIPHeaders); copied != originalIPHeadersLength { panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got %d, want %d", copied, originalIPHeadersLength)) } nextHeader, _ := s.Serialize(transportProto, fragmentIPHeaders[originalIPHeadersLength:]) fragmentIPHeaders.SetNextHeader(nextHeader) fragmentIPHeaders.SetPayloadLength(uint16(copied + fragmentIPHeadersLength - header.IPv6MinimumSize)) return fragPkt, more }