summaryrefslogtreecommitdiffhomepage
path: root/pkg/tcpip/stack/transport_demuxer.go
diff options
context:
space:
mode:
authorIan Lewis <ianmlewis@gmail.com>2020-08-17 21:44:31 -0400
committerIan Lewis <ianmlewis@gmail.com>2020-08-17 21:44:31 -0400
commitac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch)
tree0cbc5018e8807421d701d190dc20525726c7ca76 /pkg/tcpip/stack/transport_demuxer.go
parent352ae1022ce19de28fc72e034cc469872ad79d06 (diff)
parent6d0c5803d557d453f15ac6f683697eeb46dab680 (diff)
Merge branch 'master' into ip-forwarding
- Merges aleksej-paschenko's with HEAD - Adds vfs2 support for ip_forward
Diffstat (limited to 'pkg/tcpip/stack/transport_demuxer.go')
-rw-r--r--pkg/tcpip/stack/transport_demuxer.go566
1 files changed, 377 insertions, 189 deletions
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 97a1aec4b..b902c6ca9 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -17,12 +17,12 @@ package stack
import (
"fmt"
"math/rand"
- "sync"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
- "gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/ports"
)
type protocolIDs struct {
@@ -35,28 +35,130 @@ type protocolIDs struct {
type transportEndpoints struct {
// mu protects all fields of the transportEndpoints.
mu sync.RWMutex
- endpoints map[TransportEndpointID]*endpointsByNic
+ endpoints map[TransportEndpointID]*endpointsByNIC
// rawEndpoints contains endpoints for raw sockets, which receive all
// traffic of a given protocol regardless of port.
rawEndpoints []RawTransportEndpoint
}
-type endpointsByNic struct {
+// unregisterEndpoint unregisters the endpoint with the given id such that it
+// won't receive any more packets.
+func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
+ eps.mu.Lock()
+ defer eps.mu.Unlock()
+ epsByNIC, ok := eps.endpoints[id]
+ if !ok {
+ return
+ }
+ if !epsByNIC.unregisterEndpoint(bindToDevice, ep, flags) {
+ return
+ }
+ delete(eps.endpoints, id)
+}
+
+func (eps *transportEndpoints) transportEndpoints() []TransportEndpoint {
+ eps.mu.RLock()
+ defer eps.mu.RUnlock()
+ es := make([]TransportEndpoint, 0, len(eps.endpoints))
+ for _, e := range eps.endpoints {
+ es = append(es, e.transportEndpoints()...)
+ }
+ return es
+}
+
+// iterEndpointsLocked yields all endpointsByNIC in eps that match id, in
+// descending order of match quality. If a call to yield returns false,
+// iterEndpointsLocked stops iteration and returns immediately.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) iterEndpointsLocked(id TransportEndpointID, yield func(*endpointsByNIC) bool) {
+ // Try to find a match with the id as provided.
+ if ep, ok := eps.endpoints[id]; ok {
+ if !yield(ep) {
+ return
+ }
+ }
+
+ // Try to find a match with the id minus the local address.
+ nid := id
+
+ nid.LocalAddress = ""
+ if ep, ok := eps.endpoints[nid]; ok {
+ if !yield(ep) {
+ return
+ }
+ }
+
+ // Try to find a match with the id minus the remote part.
+ nid.LocalAddress = id.LocalAddress
+ nid.RemoteAddress = ""
+ nid.RemotePort = 0
+ if ep, ok := eps.endpoints[nid]; ok {
+ if !yield(ep) {
+ return
+ }
+ }
+
+ // Try to find a match with only the local port.
+ nid.LocalAddress = ""
+ if ep, ok := eps.endpoints[nid]; ok {
+ if !yield(ep) {
+ return
+ }
+ }
+}
+
+// findAllEndpointsLocked returns all endpointsByNIC in eps that match id, in
+// descending order of match quality.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findAllEndpointsLocked(id TransportEndpointID) []*endpointsByNIC {
+ var matchedEPs []*endpointsByNIC
+ eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool {
+ matchedEPs = append(matchedEPs, ep)
+ return true
+ })
+ return matchedEPs
+}
+
+// findEndpointLocked returns the endpoint that most closely matches the given id.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findEndpointLocked(id TransportEndpointID) *endpointsByNIC {
+ var matchedEP *endpointsByNIC
+ eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool {
+ matchedEP = ep
+ return false
+ })
+ return matchedEP
+}
+
+type endpointsByNIC struct {
mu sync.RWMutex
endpoints map[tcpip.NICID]*multiPortEndpoint
// seed is a random secret for a jenkins hash.
seed uint32
}
+func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint {
+ epsByNIC.mu.RLock()
+ defer epsByNIC.mu.RUnlock()
+ var eps []TransportEndpoint
+ for _, ep := range epsByNIC.endpoints {
+ eps = append(eps, ep.transportEndpoints()...)
+ }
+ return eps
+}
+
// HandlePacket is called by the stack when new packets arrive to this transport
// endpoint.
-func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
- epsByNic.mu.RLock()
+func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) {
+ epsByNIC.mu.RLock()
- mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+ mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
if !ok {
- if mpep, ok = epsByNic.endpoints[0]; !ok {
- epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+ if mpep, ok = epsByNIC.endpoints[0]; !ok {
+ epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
return
}
}
@@ -64,24 +166,30 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, v
// If this is a broadcast or multicast datagram, deliver the datagram to all
// endpoints bound to the right device.
if isMulticastOrBroadcast(id.LocalAddress) {
- mpep.handlePacketAll(r, id, vv)
- epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+ mpep.handlePacketAll(r, id, pkt)
+ epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
return
}
-
// multiPortEndpoints are guaranteed to have at least one element.
- selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, vv)
- epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+ transEP := selectEndpoint(id, mpep, epsByNIC.seed)
+ if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
+ queuedProtocol.QueuePacket(r, transEP, id, pkt)
+ epsByNIC.mu.RUnlock()
+ return
+ }
+
+ transEP.HandlePacket(r, id, pkt)
+ epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
}
// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) {
- epsByNic.mu.RLock()
- defer epsByNic.mu.RUnlock()
+func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt *PacketBuffer) {
+ epsByNIC.mu.RLock()
+ defer epsByNIC.mu.RUnlock()
- mpep, ok := epsByNic.endpoints[n.ID()]
+ mpep, ok := epsByNIC.endpoints[n.ID()]
if !ok {
- mpep, ok = epsByNic.endpoints[0]
+ mpep, ok = epsByNIC.endpoints[0]
}
if !ok {
return
@@ -91,55 +199,52 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
// broadcast like we are doing with handlePacket above?
// multiPortEndpoints are guaranteed to have at least one element.
- selectEndpoint(id, mpep, epsByNic.seed).HandleControlPacket(id, typ, extra, vv)
+ selectEndpoint(id, mpep, epsByNIC.seed).HandleControlPacket(id, typ, extra, pkt)
}
// registerEndpoint returns true if it succeeds. It fails and returns
// false if ep already has an element with the same key.
-func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
- epsByNic.mu.Lock()
- defer epsByNic.mu.Unlock()
+func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+ epsByNIC.mu.Lock()
+ defer epsByNIC.mu.Unlock()
- if multiPortEp, ok := epsByNic.endpoints[bindToDevice]; ok {
- // There was already a bind.
- return multiPortEp.singleRegisterEndpoint(t, reusePort)
+ multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
+ if !ok {
+ multiPortEp = &multiPortEndpoint{
+ demux: d,
+ netProto: netProto,
+ transProto: transProto,
+ }
+ epsByNIC.endpoints[bindToDevice] = multiPortEp
}
- // This is a new binding.
- multiPortEp := &multiPortEndpoint{}
- multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
- multiPortEp.reuse = reusePort
- epsByNic.endpoints[bindToDevice] = multiPortEp
- return multiPortEp.singleRegisterEndpoint(t, reusePort)
+ return multiPortEp.singleRegisterEndpoint(t, flags)
}
-// unregisterEndpoint returns true if endpointsByNic has to be unregistered.
-func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool {
- epsByNic.mu.Lock()
- defer epsByNic.mu.Unlock()
- multiPortEp, ok := epsByNic.endpoints[bindToDevice]
+func (epsByNIC *endpointsByNIC) checkEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+ epsByNIC.mu.RLock()
+ defer epsByNIC.mu.RUnlock()
+
+ multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
if !ok {
- return false
- }
- if multiPortEp.unregisterEndpoint(t) {
- delete(epsByNic.endpoints, bindToDevice)
+ return nil
}
- return len(epsByNic.endpoints) == 0
+
+ return multiPortEp.singleCheckEndpoint(flags)
}
-// unregisterEndpoint unregisters the endpoint with the given id such that it
-// won't receive any more packets.
-func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
- eps.mu.Lock()
- defer eps.mu.Unlock()
- epsByNic, ok := eps.endpoints[id]
+// unregisterEndpoint returns true if endpointsByNIC has to be unregistered.
+func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint, flags ports.Flags) bool {
+ epsByNIC.mu.Lock()
+ defer epsByNIC.mu.Unlock()
+ multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
if !ok {
- return
+ return false
}
- if !epsByNic.unregisterEndpoint(bindToDevice, ep) {
- return
+ if multiPortEp.unregisterEndpoint(t, flags) {
+ delete(epsByNIC.endpoints, bindToDevice)
}
- delete(eps.endpoints, id)
+ return len(epsByNIC.endpoints) == 0
}
// transportDemuxer demultiplexes packets targeted at a transport endpoint
@@ -149,17 +254,33 @@ func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep Tra
// newTransportDemuxer.
type transportDemuxer struct {
// protocol is immutable.
- protocol map[protocolIDs]*transportEndpoints
+ protocol map[protocolIDs]*transportEndpoints
+ queuedProtocols map[protocolIDs]queuedTransportProtocol
+}
+
+// queuedTransportProtocol if supported by a protocol implementation will cause
+// the dispatcher to delivery packets to the QueuePacket method instead of
+// calling HandlePacket directly on the endpoint.
+type queuedTransportProtocol interface {
+ QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt *PacketBuffer)
}
func newTransportDemuxer(stack *Stack) *transportDemuxer {
- d := &transportDemuxer{protocol: make(map[protocolIDs]*transportEndpoints)}
+ d := &transportDemuxer{
+ protocol: make(map[protocolIDs]*transportEndpoints),
+ queuedProtocols: make(map[protocolIDs]queuedTransportProtocol),
+ }
// Add each network and transport pair to the demuxer.
for netProto := range stack.networkProtocols {
for proto := range stack.transportProtocols {
- d.protocol[protocolIDs{netProto, proto}] = &transportEndpoints{
- endpoints: make(map[TransportEndpointID]*endpointsByNic),
+ protoIDs := protocolIDs{netProto, proto}
+ d.protocol[protoIDs] = &transportEndpoints{
+ endpoints: make(map[TransportEndpointID]*endpointsByNIC),
+ }
+ qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol)
+ if isQueued {
+ d.queuedProtocols[protoIDs] = qTransProto
}
}
}
@@ -169,10 +290,21 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer {
// registerEndpoint registers the given endpoint with the dispatcher such that
// packets that match the endpoint ID are delivered to it.
-func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
for i, n := range netProtos {
- if err := d.singleRegisterEndpoint(n, protocol, id, ep, reusePort, bindToDevice); err != nil {
- d.unregisterEndpoint(netProtos[:i], protocol, id, ep, bindToDevice)
+ if err := d.singleRegisterEndpoint(n, protocol, id, ep, flags, bindToDevice); err != nil {
+ d.unregisterEndpoint(netProtos[:i], protocol, id, ep, flags, bindToDevice)
+ return err
+ }
+ }
+
+ return nil
+}
+
+// checkEndpoint checks if an endpoint can be registered with the dispatcher.
+func (d *transportDemuxer) checkEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+ for _, n := range netProtos {
+ if err := d.singleCheckEndpoint(n, protocol, id, flags, bindToDevice); err != nil {
return err
}
}
@@ -183,12 +315,29 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
// multiPortEndpoint is a container for TransportEndpoints which are bound to
// the same pair of address and port. endpointsArr always has at least one
// element.
+//
+// FIXME(gvisor.dev/issue/873): Restore this properly. Currently, we just save
+// this to ensure that the underlying endpoints get saved/restored, but not not
+// use the restored copy.
+//
+// +stateify savable
type multiPortEndpoint struct {
- mu sync.RWMutex
- endpointsArr []TransportEndpoint
- endpointsMap map[TransportEndpoint]int
- // reuse indicates if more than one endpoint is allowed.
- reuse bool
+ mu sync.RWMutex `state:"nosave"`
+ demux *transportDemuxer
+ netProto tcpip.NetworkProtocolNumber
+ transProto tcpip.TransportProtocolNumber
+
+ // endpoints stores the transport endpoints in the order in which they
+ // were bound. This is required for UDP SO_REUSEADDR.
+ endpoints []TransportEndpoint
+ flags ports.FlagCounter
+}
+
+func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint {
+ ep.mu.RLock()
+ eps := append([]TransportEndpoint(nil), ep.endpoints...)
+ ep.mu.RUnlock()
+ return eps
}
// reciprocalScale scales a value into range [0, n).
@@ -203,8 +352,12 @@ func reciprocalScale(val, n uint32) uint32 {
// ports then uses it to select a socket. In this case, all packets from one
// address will be sent to same endpoint.
func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32) TransportEndpoint {
- if len(mpep.endpointsArr) == 1 {
- return mpep.endpointsArr[0]
+ if len(mpep.endpoints) == 1 {
+ return mpep.endpoints[0]
+ }
+
+ if mpep.flags.IntersectionRefs().ToFlags().Effective().MostRecent {
+ return mpep.endpoints[len(mpep.endpoints)-1]
}
payload := []byte{
@@ -220,72 +373,89 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
h.Write([]byte(id.RemoteAddress))
hash := h.Sum32()
- idx := reciprocalScale(hash, uint32(len(mpep.endpointsArr)))
- return mpep.endpointsArr[idx]
+ idx := reciprocalScale(hash, uint32(len(mpep.endpoints)))
+ return mpep.endpoints[idx]
}
-func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
+func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt *PacketBuffer) {
ep.mu.RLock()
- for i, endpoint := range ep.endpointsArr {
- // HandlePacket modifies vv, so each endpoint needs its own copy except for
- // the final one.
- if i == len(ep.endpointsArr)-1 {
- endpoint.HandlePacket(r, id, vv)
- break
+ queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
+ // HandlePacket takes ownership of pkt, so each endpoint needs
+ // its own copy except for the final one.
+ for _, endpoint := range ep.endpoints[:len(ep.endpoints)-1] {
+ if mustQueue {
+ queuedProtocol.QueuePacket(r, endpoint, id, pkt.Clone())
+ } else {
+ endpoint.HandlePacket(r, id, pkt.Clone())
}
- vvCopy := buffer.NewView(vv.Size())
- copy(vvCopy, vv.ToView())
- endpoint.HandlePacket(r, id, vvCopy.ToVectorisedView())
+ }
+ if endpoint := ep.endpoints[len(ep.endpoints)-1]; mustQueue {
+ queuedProtocol.QueuePacket(r, endpoint, id, pkt)
+ } else {
+ endpoint.HandlePacket(r, id, pkt)
}
ep.mu.RUnlock() // Don't use defer for performance reasons.
}
// singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint
// list. The list might be empty already.
-func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePort bool) *tcpip.Error {
+func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, flags ports.Flags) *tcpip.Error {
ep.mu.Lock()
defer ep.mu.Unlock()
- if len(ep.endpointsArr) > 0 {
+ bits := flags.Bits() & ports.MultiBindFlagMask
+
+ if len(ep.endpoints) != 0 {
+ // If it was previously bound, we need to check if we can bind again.
+ if ep.flags.TotalRefs() > 0 && bits&ep.flags.IntersectionRefs() == 0 {
+ return tcpip.ErrPortInUse
+ }
+ }
+
+ ep.endpoints = append(ep.endpoints, t)
+ ep.flags.AddRef(bits)
+
+ return nil
+}
+
+func (ep *multiPortEndpoint) singleCheckEndpoint(flags ports.Flags) *tcpip.Error {
+ ep.mu.RLock()
+ defer ep.mu.RUnlock()
+
+ bits := flags.Bits() & ports.MultiBindFlagMask
+
+ if len(ep.endpoints) != 0 {
// If it was previously bound, we need to check if we can bind again.
- if !ep.reuse || !reusePort {
+ if ep.flags.TotalRefs() > 0 && bits&ep.flags.IntersectionRefs() == 0 {
return tcpip.ErrPortInUse
}
}
- // A new endpoint is added into endpointsArr and its index there is saved in
- // endpointsMap. This will allow us to remove endpoint from the array fast.
- ep.endpointsMap[t] = len(ep.endpointsArr)
- ep.endpointsArr = append(ep.endpointsArr, t)
return nil
}
// unregisterEndpoint returns true if multiPortEndpoint has to be unregistered.
-func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint) bool {
+func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint, flags ports.Flags) bool {
ep.mu.Lock()
defer ep.mu.Unlock()
- idx, ok := ep.endpointsMap[t]
- if !ok {
- return false
- }
- delete(ep.endpointsMap, t)
- l := len(ep.endpointsArr)
- if l > 1 {
- // The last endpoint in endpointsArr is moved instead of the deleted one.
- lastEp := ep.endpointsArr[l-1]
- ep.endpointsArr[idx] = lastEp
- ep.endpointsMap[lastEp] = idx
- ep.endpointsArr = ep.endpointsArr[0 : l-1]
- return false
+ for i, endpoint := range ep.endpoints {
+ if endpoint == t {
+ copy(ep.endpoints[i:], ep.endpoints[i+1:])
+ ep.endpoints[len(ep.endpoints)-1] = nil
+ ep.endpoints = ep.endpoints[:len(ep.endpoints)-1]
+
+ ep.flags.DropRef(flags.Bits() & ports.MultiBindFlagMask)
+ break
+ }
}
- return true
+ return len(ep.endpoints) == 0
}
-func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
if id.RemotePort != 0 {
- // TODO(eyalsoha): Why?
- reusePort = false
+ // SO_REUSEPORT only applies to bound/listening endpoints.
+ flags.LoadBalanced = false
}
eps, ok := d.protocol[protocolIDs{netProto, protocol}]
@@ -296,82 +466,109 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
eps.mu.Lock()
defer eps.mu.Unlock()
- if epsByNic, ok := eps.endpoints[id]; ok {
- // There was already a binding.
- return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+ epsByNIC, ok := eps.endpoints[id]
+ if !ok {
+ epsByNIC = &endpointsByNIC{
+ endpoints: make(map[tcpip.NICID]*multiPortEndpoint),
+ seed: rand.Uint32(),
+ }
+ eps.endpoints[id] = epsByNIC
+ }
+
+ return epsByNIC.registerEndpoint(d, netProto, protocol, ep, flags, bindToDevice)
+}
+
+func (d *transportDemuxer) singleCheckEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+ if id.RemotePort != 0 {
+ // SO_REUSEPORT only applies to bound/listening endpoints.
+ flags.LoadBalanced = false
+ }
+
+ eps, ok := d.protocol[protocolIDs{netProto, protocol}]
+ if !ok {
+ return tcpip.ErrUnknownProtocol
}
- // This is a new binding.
- epsByNic := &endpointsByNic{
- endpoints: make(map[tcpip.NICID]*multiPortEndpoint),
- seed: rand.Uint32(),
+ eps.mu.RLock()
+ defer eps.mu.RUnlock()
+
+ epsByNIC, ok := eps.endpoints[id]
+ if !ok {
+ return nil
}
- eps.endpoints[id] = epsByNic
- return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+ return epsByNIC.checkEndpoint(d, netProto, protocol, flags, bindToDevice)
}
// unregisterEndpoint unregisters the endpoint with the given id such that it
// won't receive any more packets.
-func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
+func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
+ if id.RemotePort != 0 {
+ // SO_REUSEPORT only applies to bound/listening endpoints.
+ flags.LoadBalanced = false
+ }
+
for _, n := range netProtos {
if eps, ok := d.protocol[protocolIDs{n, protocol}]; ok {
- eps.unregisterEndpoint(id, ep, bindToDevice)
+ eps.unregisterEndpoint(id, ep, flags, bindToDevice)
}
}
}
-var loopbackSubnet = func() tcpip.Subnet {
- sn, err := tcpip.NewSubnet("\x7f\x00\x00\x00", "\xff\x00\x00\x00")
- if err != nil {
- panic(err)
- }
- return sn
-}()
-
// deliverPacket attempts to find one or more matching transport endpoints, and
-// then, if matches are found, delivers the packet to them. Returns true if it
-// found one or more endpoints, false otherwise.
-func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView, id TransportEndpointID) bool {
+// then, if matches are found, delivers the packet to them. Returns true if
+// the packet no longer needs to be handled.
+func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer, id TransportEndpointID) bool {
eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
if !ok {
return false
}
- eps.mu.RLock()
-
- // Determine which transport endpoint or endpoints to deliver this packet to.
- // If the packet is a broadcast or multicast, then find all matching
+ // If the packet is a UDP broadcast or multicast, then find all matching
// transport endpoints.
- var destEps []*endpointsByNic
if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) {
- destEps = d.findAllEndpointsLocked(eps, vv, id)
- } else if ep := d.findEndpointLocked(eps, vv, id); ep != nil {
- destEps = append(destEps, ep)
+ eps.mu.RLock()
+ destEPs := eps.findAllEndpointsLocked(id)
+ eps.mu.RUnlock()
+ // Fail if we didn't find at least one matching transport endpoint.
+ if len(destEPs) == 0 {
+ r.Stats().UDP.UnknownPortErrors.Increment()
+ return false
+ }
+ // handlePacket takes ownership of pkt, so each endpoint needs its own
+ // copy except for the final one.
+ for _, ep := range destEPs[:len(destEPs)-1] {
+ ep.handlePacket(r, id, pkt.Clone())
+ }
+ destEPs[len(destEPs)-1].handlePacket(r, id, pkt)
+ return true
}
- eps.mu.RUnlock()
+ // If the packet is a TCP packet with a non-unicast source or destination
+ // address, then do nothing further and instruct the caller to do the same.
+ if protocol == header.TCPProtocolNumber && (!isUnicast(r.LocalAddress) || !isUnicast(r.RemoteAddress)) {
+ // TCP can only be used to communicate between a single source and a
+ // single destination; the addresses must be unicast.
+ r.Stats().TCP.InvalidSegmentsReceived.Increment()
+ return true
+ }
- // Fail if we didn't find at least one matching transport endpoint.
- if len(destEps) == 0 {
- // UDP packet could not be delivered to an unknown destination port.
+ eps.mu.RLock()
+ ep := eps.findEndpointLocked(id)
+ eps.mu.RUnlock()
+ if ep == nil {
if protocol == header.UDPProtocolNumber {
r.Stats().UDP.UnknownPortErrors.Increment()
}
return false
}
-
- // Deliver the packet.
- for _, ep := range destEps {
- ep.handlePacket(r, id, vv)
- }
-
+ ep.handlePacket(r, id, pkt)
return true
}
// deliverRawPacket attempts to deliver the given packet and returns whether it
// was delivered successfully.
-func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) bool {
+func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) bool {
eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
if !ok {
return false
@@ -385,7 +582,7 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
for _, rawEP := range eps.rawEndpoints {
// Each endpoint gets its own copy of the packet for the sake
// of save/restore.
- rawEP.HandlePacket(r, buffer.NewViewFromBytes(netHeader), vv.ToView().ToVectorisedView())
+ rawEP.HandlePacket(r, pkt)
foundRaw = true
}
eps.mu.RUnlock()
@@ -395,67 +592,51 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
// deliverControlPacket attempts to deliver the given control packet. Returns
// true if it found an endpoint, false otherwise.
-func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer, id TransportEndpointID) bool {
eps, ok := d.protocol[protocolIDs{net, trans}]
if !ok {
return false
}
- // Try to find the endpoint.
eps.mu.RLock()
- ep := d.findEndpointLocked(eps, vv, id)
+ ep := eps.findEndpointLocked(id)
eps.mu.RUnlock()
-
- // Fail if we didn't find one.
if ep == nil {
return false
}
- // Deliver the packet.
- ep.handleControlPacket(n, id, typ, extra, vv)
-
+ ep.handleControlPacket(n, id, typ, extra, pkt)
return true
}
-func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) []*endpointsByNic {
- var matchedEPs []*endpointsByNic
- // Try to find a match with the id as provided.
- if ep, ok := eps.endpoints[id]; ok {
- matchedEPs = append(matchedEPs, ep)
+// findTransportEndpoint find a single endpoint that most closely matches the provided id.
+func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
+ eps, ok := d.protocol[protocolIDs{netProto, transProto}]
+ if !ok {
+ return nil
}
- // Try to find a match with the id minus the local address.
- nid := id
-
- nid.LocalAddress = ""
- if ep, ok := eps.endpoints[nid]; ok {
- matchedEPs = append(matchedEPs, ep)
+ eps.mu.RLock()
+ epsByNIC := eps.findEndpointLocked(id)
+ if epsByNIC == nil {
+ eps.mu.RUnlock()
+ return nil
}
- // Try to find a match with the id minus the remote part.
- nid.LocalAddress = id.LocalAddress
- nid.RemoteAddress = ""
- nid.RemotePort = 0
- if ep, ok := eps.endpoints[nid]; ok {
- matchedEPs = append(matchedEPs, ep)
- }
+ epsByNIC.mu.RLock()
+ eps.mu.RUnlock()
- // Try to find a match with only the local port.
- nid.LocalAddress = ""
- if ep, ok := eps.endpoints[nid]; ok {
- matchedEPs = append(matchedEPs, ep)
+ mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
+ if !ok {
+ if mpep, ok = epsByNIC.endpoints[0]; !ok {
+ epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
+ return nil
+ }
}
- return matchedEPs
-}
-
-// findEndpointLocked returns the endpoint that most closely matches the given
-// id.
-func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) *endpointsByNic {
- if matchedEPs := d.findAllEndpointsLocked(eps, vv, id); len(matchedEPs) > 0 {
- return matchedEPs[0]
- }
- return nil
+ ep := selectEndpoint(id, mpep, epsByNIC.seed)
+ epsByNIC.mu.RUnlock()
+ return ep
}
// registerRawEndpoint registers the given endpoint with the dispatcher such
@@ -469,8 +650,8 @@ func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNum
}
eps.mu.Lock()
- defer eps.mu.Unlock()
eps.rawEndpoints = append(eps.rawEndpoints, ep)
+ eps.mu.Unlock()
return nil
}
@@ -484,15 +665,22 @@ func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolN
}
eps.mu.Lock()
- defer eps.mu.Unlock()
for i, rawEP := range eps.rawEndpoints {
if rawEP == ep {
- eps.rawEndpoints = append(eps.rawEndpoints[:i], eps.rawEndpoints[i+1:]...)
- return
+ lastIdx := len(eps.rawEndpoints) - 1
+ eps.rawEndpoints[i] = eps.rawEndpoints[lastIdx]
+ eps.rawEndpoints[lastIdx] = nil
+ eps.rawEndpoints = eps.rawEndpoints[:lastIdx]
+ break
}
}
+ eps.mu.Unlock()
}
func isMulticastOrBroadcast(addr tcpip.Address) bool {
return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr)
}
+
+func isUnicast(addr tcpip.Address) bool {
+ return addr != header.IPv4Any && addr != header.IPv6Any && !isMulticastOrBroadcast(addr)
+}