diff options
-rw-r--r-- | pkg/sentry/socket/socket.go | 1 | ||||
-rw-r--r-- | pkg/tcpip/link/loopback/loopback.go | 30 | ||||
-rw-r--r-- | pkg/tcpip/stack/nic.go | 54 | ||||
-rw-r--r-- | pkg/tcpip/stack/stack.go | 66 | ||||
-rw-r--r-- | pkg/tcpip/transport/icmp/endpoint.go | 1 | ||||
-rw-r--r-- | pkg/tcpip/transport/packet/endpoint.go | 48 | ||||
-rw-r--r-- | pkg/tcpip/transport/packet/endpoint_state.go | 6 | ||||
-rw-r--r-- | pkg/tcpip/transport/raw/endpoint.go | 4 | ||||
-rw-r--r-- | pkg/tcpip/transport/udp/endpoint.go | 1 | ||||
-rw-r--r-- | pkg/tcpip/transport/udp/endpoint_state.go | 6 | ||||
-rw-r--r-- | runsc/boot/boot_state_autogen.go | 3 | ||||
-rw-r--r-- | runsc/boot/loader.go | 23 | ||||
-rw-r--r-- | runsc/config/config.go | 3 | ||||
-rw-r--r-- | runsc/config/flags.go | 1 |
14 files changed, 182 insertions, 65 deletions
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 658e90bb9..83b9d9389 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -750,6 +750,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { return tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + Port: Ntohs(a.Protocol), }, family, nil case linux.AF_UNSPEC: diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index d7bbfa639..ca1f9c08d 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -76,19 +76,8 @@ func (*endpoint) Wait() {} // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound // packets to the network-layer dispatcher. -func (e *endpoint) WritePacket(_ stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { - // Construct data as the unparsed portion for the loopback packet. - data := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) - - // Because we're immediately turning around and writing the packet back - // to the rx path, we intentionally don't preserve the remote and local - // link addresses from the stack.Route we're passed. - newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: data, - }) - e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, newPkt) - - return nil +func (e *endpoint) WritePacket(_ stack.RouteInfo, _ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { + return e.WriteRawPacket(pkt) } // WritePackets implements stack.LinkEndpoint.WritePackets. @@ -105,4 +94,17 @@ func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net } // WriteRawPacket implements stack.LinkEndpoint. -func (*endpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } +func (e *endpoint) WriteRawPacket(pkt *stack.PacketBuffer) tcpip.Error { + // Construct data as the unparsed portion for the loopback packet. + data := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) + + // Because we're immediately turning around and writing the packet back + // to the rx path, we intentionally don't preserve the remote and local + // link addresses from the stack.Route we're passed. + newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: data, + }) + e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, pkt.NetworkProtocolNumber, newPkt) + + return nil +} diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index b854d868c..ddc1ddab6 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -734,10 +734,29 @@ func (n *nic) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp n.mu.RUnlock() // Deliver to interested packet endpoints without holding NIC lock. + var packetEPPkt *PacketBuffer deliverPacketEPs := func(ep PacketEndpoint) { - p := pkt.Clone() - p.PktType = tcpip.PacketHost - ep.HandlePacket(n.id, local, protocol, p) + if packetEPPkt == nil { + // Packet endpoints hold the full packet. + // + // We perform a deep copy because higher-level endpoints may point to + // the middle of a view that is held by a packet endpoint. Save/Restore + // does not support overlapping slices and will panic in this case. + // + // TODO(https://gvisor.dev/issue/6517): Avoid this copy once S/R supports + // overlapping slices (e.g. by passing a shallow copy of pkt to the packet + // endpoint). + packetEPPkt = NewPacketBuffer(PacketBufferOptions{ + Data: PayloadSince(pkt.LinkHeader()).ToVectorisedView(), + }) + // If a link header was populated in the original packet buffer, then + // populate it in the packet buffer we provide to packet endpoints as + // packet endpoints inspect link headers. + packetEPPkt.LinkHeader().Consume(pkt.LinkHeader().View().Size()) + packetEPPkt.PktType = tcpip.PacketHost + } + + ep.HandlePacket(n.id, local, protocol, packetEPPkt.Clone()) } if protoEPs != nil { protoEPs.forEach(deliverPacketEPs) @@ -758,13 +777,30 @@ func (n *nic) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tc eps := n.mu.packetEPs[header.EthernetProtocolAll] n.mu.RUnlock() + var packetEPPkt *PacketBuffer eps.forEach(func(ep PacketEndpoint) { - p := pkt.Clone() - p.PktType = tcpip.PacketOutgoing - // Add the link layer header as outgoing packets are intercepted - // before the link layer header is created. - n.LinkEndpoint.AddHeader(local, remote, protocol, p) - ep.HandlePacket(n.id, local, protocol, p) + if packetEPPkt == nil { + // Packet endpoints hold the full packet. + // + // We perform a deep copy because higher-level endpoints may point to + // the middle of a view that is held by a packet endpoint. Save/Restore + // does not support overlapping slices and will panic in this case. + // + // TODO(https://gvisor.dev/issue/6517): Avoid this copy once S/R supports + // overlapping slices (e.g. by passing a shallow copy of pkt to the packet + // endpoint). + packetEPPkt = NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: pkt.AvailableHeaderBytes(), + Data: PayloadSince(pkt.NetworkHeader()).ToVectorisedView(), + }) + // Add the link layer header as outgoing packets are intercepted before + // the link layer header is created and packet endpoints are interested + // in the link header. + n.LinkEndpoint.AddHeader(local, remote, protocol, packetEPPkt) + packetEPPkt.PktType = tcpip.PacketOutgoing + } + + ep.HandlePacket(n.id, local, protocol, packetEPPkt.Clone()) }) } diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index cfa8a2e8f..cb741e540 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -72,7 +72,8 @@ type Stack struct { // rawFactory creates raw endpoints. If nil, raw endpoints are // disabled. It is set during Stack creation and is immutable. - rawFactory RawFactory + rawFactory RawFactory + packetEndpointWriteSupported bool demux *transportDemuxer @@ -218,6 +219,10 @@ type Options struct { // this is non-nil. RawFactory RawFactory + // AllowPacketEndpointWrite determines if packet endpoints support write + // operations. + AllowPacketEndpointWrite bool + // RandSource is an optional source to use to generate random // numbers. If omitted it defaults to a Source seeded by the data // returned by the stack secure RNG. @@ -359,23 +364,24 @@ func New(opts Options) *Stack { opts.NUDConfigs.resetInvalidFields() s := &Stack{ - transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), - networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), - nics: make(map[tcpip.NICID]*nic), - defaultForwardingEnabled: make(map[tcpip.NetworkProtocolNumber]struct{}), - cleanupEndpoints: make(map[TransportEndpoint]struct{}), - PortManager: ports.NewPortManager(), - clock: clock, - stats: opts.Stats.FillIn(), - handleLocal: opts.HandleLocal, - tables: opts.IPTables, - icmpRateLimiter: NewICMPRateLimiter(), - seed: seed, - nudConfigs: opts.NUDConfigs, - uniqueIDGenerator: opts.UniqueID, - nudDisp: opts.NUDDisp, - randomGenerator: randomGenerator, - secureRNG: opts.SecureRNG, + transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), + networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), + nics: make(map[tcpip.NICID]*nic), + packetEndpointWriteSupported: opts.AllowPacketEndpointWrite, + defaultForwardingEnabled: make(map[tcpip.NetworkProtocolNumber]struct{}), + cleanupEndpoints: make(map[TransportEndpoint]struct{}), + PortManager: ports.NewPortManager(), + clock: clock, + stats: opts.Stats.FillIn(), + handleLocal: opts.HandleLocal, + tables: opts.IPTables, + icmpRateLimiter: NewICMPRateLimiter(), + seed: seed, + nudConfigs: opts.NUDConfigs, + uniqueIDGenerator: opts.UniqueID, + nudDisp: opts.NUDDisp, + randomGenerator: randomGenerator, + secureRNG: opts.SecureRNG, sendBufferSize: tcpip.SendBufferSizeOption{ Min: MinBufferSize, Default: DefaultBufferSize, @@ -1653,9 +1659,27 @@ func (s *Stack) WritePacketToRemote(nicID tcpip.NICID, remote tcpip.LinkAddress, ReserveHeaderBytes: int(nic.MaxHeaderLength()), Data: payload, }) + pkt.NetworkProtocolNumber = netProto return nic.WritePacketToRemote(remote, netProto, pkt) } +// WriteRawPacket writes data directly to the specified NIC without adding any +// headers. +func (s *Stack) WriteRawPacket(nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) tcpip.Error { + s.mu.RLock() + nic, ok := s.nics[nicID] + s.mu.RUnlock() + if !ok { + return &tcpip.ErrUnknownNICID{} + } + + pkt := NewPacketBuffer(PacketBufferOptions{ + Data: payload, + }) + pkt.NetworkProtocolNumber = proto + return nic.WriteRawPacket(pkt) +} + // NetworkProtocolInstance returns the protocol instance in the stack for the // specified network protocol. This method is public for protocol implementers // and tests to use. @@ -1947,3 +1971,9 @@ func (s *Stack) IsSubnetBroadcast(nicID tcpip.NICID, protocol tcpip.NetworkProto return false } + +// PacketEndpointWriteSupported returns true iff packet endpoints support write +// operations. +func (s *Stack) PacketEndpointWriteSupported() bool { + return s.packetEndpointWriteSupported +} diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index f9a15efb2..00497bf07 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -329,6 +329,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp route = r } + // TODO(https://gvisor.dev/issue/6538): Avoid this allocation. v := make([]byte, p.Len()) if _, err := io.ReadFull(p, v); err != nil { return 0, &tcpip.ErrBadBuffer{} diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 8e7bb6c6e..89b4720aa 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -207,8 +207,52 @@ func (ep *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResul return res, nil } -func (*endpoint) Write(tcpip.Payloader, tcpip.WriteOptions) (int64, tcpip.Error) { - return 0, &tcpip.ErrInvalidOptionValue{} +func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { + if !ep.stack.PacketEndpointWriteSupported() { + return 0, &tcpip.ErrNotSupported{} + } + + ep.mu.Lock() + closed := ep.closed + nicID := ep.boundNIC + ep.mu.Unlock() + if closed { + return 0, &tcpip.ErrClosedForSend{} + } + + var remote tcpip.LinkAddress + proto := ep.netProto + if to := opts.To; to != nil { + remote = tcpip.LinkAddress(to.Addr) + + if n := to.NIC; n != 0 { + nicID = n + } + + if p := to.Port; p != 0 { + proto = tcpip.NetworkProtocolNumber(p) + } + } + + if nicID == 0 { + return 0, &tcpip.ErrInvalidOptionValue{} + } + + // TODO(https://gvisor.dev/issue/6538): Avoid this allocation. + payloadBytes := make(buffer.View, p.Len()) + if _, err := io.ReadFull(p, payloadBytes); err != nil { + return 0, &tcpip.ErrBadBuffer{} + } + + if err := func() tcpip.Error { + if ep.cooked { + return ep.stack.WritePacketToRemote(nicID, remote, proto, payloadBytes.ToVectorisedView()) + } + return ep.stack.WriteRawPacket(nicID, proto, payloadBytes.ToVectorisedView()) + }(); err != nil { + return 0, err + } + return int64(len(payloadBytes)), nil } // Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be diff --git a/pkg/tcpip/transport/packet/endpoint_state.go b/pkg/tcpip/transport/packet/endpoint_state.go index e729921db..5c688d286 100644 --- a/pkg/tcpip/transport/packet/endpoint_state.go +++ b/pkg/tcpip/transport/packet/endpoint_state.go @@ -34,17 +34,11 @@ func (p *packet) loadReceivedAt(nsec int64) { // saveData saves packet.data field. func (p *packet) saveData() buffer.VectorisedView { - // We cannot save p.data directly as p.data.views may alias to p.views, - // which is not allowed by state framework (in-struct pointer). return p.data.Clone(nil) } // loadData loads packet.data field. func (p *packet) loadData(data buffer.VectorisedView) { - // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization - // here because data.views is not guaranteed to be loaded by now. Plus, - // data.views will be allocated anyway so there really is little point - // of utilizing p.views for data.views. p.data = data } diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 55854ba59..3bf6c0a8f 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -281,6 +281,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp return nil, nil, nil, &tcpip.ErrInvalidEndpointState{} } + // TODO(https://gvisor.dev/issue/6538): Avoid this allocation. payloadBytes := make([]byte, p.Len()) if _, err := io.ReadFull(p, payloadBytes); err != nil { return nil, nil, nil, &tcpip.ErrBadBuffer{} @@ -600,6 +601,9 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { // We copy headers' underlying bytes because pkt.*Header may point to // the middle of a slice, and another struct may point to the "outer" // slice. Save/restore doesn't support overlapping slices and will fail. + // + // TODO(https://gvisor.dev/issue/6517): Avoid the copy once S/R supports + // overlapping slices. var combinedVV buffer.VectorisedView if e.TransportEndpointInfo.NetProto == header.IPv4ProtocolNumber { network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index ac7ecb5f8..4b6bdc3be 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -366,6 +366,7 @@ func (e *endpoint) prepareForWrite(p tcpip.Payloader, opts tcpip.WriteOptions) ( return udpPacketInfo{}, err } + // TODO(https://gvisor.dev/issue/6538): Avoid this allocation. v := make([]byte, p.Len()) if _, err := io.ReadFull(p, v); err != nil { ctx.Release() diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 20c45ab87..2ff8b0482 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -36,17 +36,11 @@ func (p *udpPacket) loadReceivedAt(nsec int64) { // saveData saves udpPacket.data field. func (p *udpPacket) saveData() buffer.VectorisedView { - // We cannot save p.data directly as p.data.views may alias to p.views, - // which is not allowed by state framework (in-struct pointer). return p.data.Clone(nil) } // loadData loads udpPacket.data field. func (p *udpPacket) loadData(data buffer.VectorisedView) { - // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization - // here because data.views is not guaranteed to be loaded by now. Plus, - // data.views will be allocated anyway so there really is little point - // of utilizing p.views for data.views. p.data = data } diff --git a/runsc/boot/boot_state_autogen.go b/runsc/boot/boot_state_autogen.go index 0f6746d1f..95494a4c1 100644 --- a/runsc/boot/boot_state_autogen.go +++ b/runsc/boot/boot_state_autogen.go @@ -14,6 +14,7 @@ func (f *sandboxNetstackCreator) StateFields() []string { return []string{ "clock", "uniqueID", + "allowPacketEndpointWrite", } } @@ -24,6 +25,7 @@ func (f *sandboxNetstackCreator) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.clock) stateSinkObject.Save(1, &f.uniqueID) + stateSinkObject.Save(2, &f.allowPacketEndpointWrite) } func (f *sandboxNetstackCreator) afterLoad() {} @@ -32,6 +34,7 @@ func (f *sandboxNetstackCreator) afterLoad() {} func (f *sandboxNetstackCreator) StateLoad(stateSourceObject state.Source) { stateSourceObject.Load(0, &f.clock) stateSourceObject.Load(1, &f.uniqueID) + stateSourceObject.Load(2, &f.allowPacketEndpointWrite) } func init() { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 3f667cd74..1dd0048ac 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -1089,13 +1089,14 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st return inet.NewRootNamespace(hostinet.NewStack(), nil), nil case config.NetworkNone, config.NetworkSandbox: - s, err := newEmptySandboxNetworkStack(clock, uniqueID) + s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite) if err != nil { return nil, err } creator := &sandboxNetstackCreator{ - clock: clock, - uniqueID: uniqueID, + clock: clock, + uniqueID: uniqueID, + allowPacketEndpointWrite: conf.AllowPacketEndpointWrite, } return inet.NewRootNamespace(s, creator), nil @@ -1105,7 +1106,7 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st } -func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { +func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) { netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} transProtos := []stack.TransportProtocolFactory{ tcp.NewProtocol, @@ -1121,9 +1122,10 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in HandleLocal: true, // Enable raw sockets for users with sufficient // privileges. - RawFactory: raw.EndpointFactory{}, - UniqueID: uniqueID, - DefaultIPTables: netfilter.DefaultLinuxTables, + RawFactory: raw.EndpointFactory{}, + AllowPacketEndpointWrite: allowPacketEndpointWrite, + UniqueID: uniqueID, + DefaultIPTables: netfilter.DefaultLinuxTables, })} // Enable SACK Recovery. @@ -1160,13 +1162,14 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in // // +stateify savable type sandboxNetstackCreator struct { - clock tcpip.Clock - uniqueID stack.UniqueID + clock tcpip.Clock + uniqueID stack.UniqueID + allowPacketEndpointWrite bool } // CreateStack implements kernel.NetworkStackCreator.CreateStack. func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { - s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID) + s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite) if err != nil { return nil, err } diff --git a/runsc/config/config.go b/runsc/config/config.go index 2f52863ff..2ce8cc006 100644 --- a/runsc/config/config.go +++ b/runsc/config/config.go @@ -86,6 +86,9 @@ type Config struct { // capabilities. EnableRaw bool `flag:"net-raw"` + // AllowPacketEndpointWrite enables write operations on packet endpoints. + AllowPacketEndpointWrite bool `flag:"TESTONLY-allow-packet-endpoint-write"` + // HardwareGSO indicates that hardware segmentation offload is enabled. HardwareGSO bool `flag:"gso"` diff --git a/runsc/config/flags.go b/runsc/config/flags.go index 85507902a..cc5aba474 100644 --- a/runsc/config/flags.go +++ b/runsc/config/flags.go @@ -92,6 +92,7 @@ func RegisterFlags() { // Test flags, not to be used outside tests, ever. flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.") + flag.Bool("TESTONLY-allow-packet-endpoint-write", false, "TEST ONLY; do not ever use! Used for tests to allow writes on packet sockets.") }) } |