From 0e1cc476dbc51646a1eed5f9de9319286c65fbf9 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Fri, 3 May 2019 11:51:59 -0700 Subject: Fix transport/raw copybara export - include packet_list.go - exclude state.go (by renaming to include an underscore) Also rename raw.go to endpoint.go for consistency. PiperOrigin-RevId: 246547912 Change-Id: I19c8331c794ba683a940cc96a8be6497b53ff24d --- pkg/tcpip/transport/raw/BUILD | 4 +- pkg/tcpip/transport/raw/endpoint.go | 558 ++++++++++++++++++++++++++++++ pkg/tcpip/transport/raw/endpoint_state.go | 88 +++++ pkg/tcpip/transport/raw/raw.go | 558 ------------------------------ pkg/tcpip/transport/raw/state.go | 88 ----- 5 files changed, 648 insertions(+), 648 deletions(-) create mode 100644 pkg/tcpip/transport/raw/endpoint.go create mode 100644 pkg/tcpip/transport/raw/endpoint_state.go delete mode 100644 pkg/tcpip/transport/raw/raw.go delete mode 100644 pkg/tcpip/transport/raw/state.go (limited to 'pkg/tcpip/transport') diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 005079639..52f6b9759 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -18,9 +18,9 @@ go_template_instance( go_library( name = "raw", srcs = [ + "endpoint.go", + "endpoint_state.go", "packet_list.go", - "raw.go", - "state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/raw", imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go new file mode 100644 index 000000000..7004c7ff4 --- /dev/null +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -0,0 +1,558 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package raw provides the implementation of raw sockets (see raw(7)). Raw +// sockets allow applications to: +// +// * manually write and inspect transport layer headers and payloads +// * receive all traffic of a given transport protcol (e.g. ICMP or UDP) +// * optionally write and inspect network layer and link layer headers for +// packets +// +// Raw sockets don't have any notion of ports, and incoming packets are +// demultiplexed solely by protocol number. Thus, a raw UDP endpoint will +// receive every UDP packet received by netstack. bind(2) and connect(2) can be +// used to filter incoming packets by source and destination. +package raw + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sleep" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// +stateify savable +type packet struct { + packetEntry + // data holds the actual packet data, including any headers and + // payload. + data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + // views is pre-allocated space to back data. As long as the packet is + // made up of fewer than 8 buffer.Views, no extra allocation is + // necessary to store packet data. + views [8]buffer.View `state:"nosave"` + // timestampNS is the unix time at which the packet was received. + timestampNS int64 + // senderAddr is the network address of the sender. + senderAddr tcpip.FullAddress +} + +// endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to +// have goroutines make concurrent calls into the endpoint. +// +// Lock order: +// endpoint.mu +// endpoint.rcvMu +// +// +stateify savable +type endpoint struct { + // The following fields are initialized at creation time and are + // immutable. + stack *stack.Stack `state:"manual"` + netProto tcpip.NetworkProtocolNumber + transProto tcpip.TransportProtocolNumber + waiterQueue *waiter.Queue + + // The following fields are used to manage the receive queue and are + // protected by rcvMu. + rcvMu sync.Mutex `state:"nosave"` + rcvList packetList + rcvBufSizeMax int `state:".(int)"` + rcvBufSize int + rcvClosed bool + + // The following fields are protected by mu. + mu sync.RWMutex `state:"nosave"` + sndBufSize int + // shutdownFlags represent the current shutdown state of the endpoint. + shutdownFlags tcpip.ShutdownFlags + closed bool + connected bool + bound bool + // registeredNIC is the NIC to which th endpoint is explicitly + // registered. Is set when Connect or Bind are used to specify a NIC. + registeredNIC tcpip.NICID + // boundNIC and boundAddr are set on calls to Bind(). When callers + // attempt actions that would invalidate the binding data (e.g. sending + // data via a NIC other than boundNIC), the endpoint will return an + // error. + boundNIC tcpip.NICID + boundAddr tcpip.Address + // route is the route to a remote network endpoint. It is set via + // Connect(), and is valid only when conneted is true. + route stack.Route `state:"manual"` +} + +// NewEndpoint returns a raw endpoint for the given protocols. +// TODO(b/129292371): IP_HDRINCL, IPPROTO_RAW, and AF_PACKET. +func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + if netProto != header.IPv4ProtocolNumber { + return nil, tcpip.ErrUnknownProtocol + } + + ep := &endpoint{ + stack: stack, + netProto: netProto, + transProto: transProto, + waiterQueue: waiterQueue, + rcvBufSizeMax: 32 * 1024, + sndBufSize: 32 * 1024, + } + + if err := ep.stack.RegisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep); err != nil { + return nil, err + } + + return ep, nil +} + +// Close implements tcpip.Endpoint.Close. +func (ep *endpoint) Close() { + ep.mu.Lock() + defer ep.mu.Unlock() + + if ep.closed { + return + } + + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + + ep.rcvMu.Lock() + defer ep.rcvMu.Unlock() + + // Clear the receive list. + ep.rcvClosed = true + ep.rcvBufSize = 0 + for !ep.rcvList.Empty() { + ep.rcvList.Remove(ep.rcvList.Front()) + } + + if ep.connected { + ep.route.Release() + } + + ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) +} + +// Read implements tcpip.Endpoint.Read. +func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + ep.rcvMu.Lock() + + // If there's no data to read, return that read would block or that the + // endpoint is closed. + if ep.rcvList.Empty() { + err := tcpip.ErrWouldBlock + if ep.rcvClosed { + err = tcpip.ErrClosedForReceive + } + ep.rcvMu.Unlock() + return buffer.View{}, tcpip.ControlMessages{}, err + } + + packet := ep.rcvList.Front() + ep.rcvList.Remove(packet) + ep.rcvBufSize -= packet.data.Size() + + ep.rcvMu.Unlock() + + if addr != nil { + *addr = packet.senderAddr + } + + return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil +} + +// Write implements tcpip.Endpoint.Write. +func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { + // MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op. + if opts.More { + return 0, nil, tcpip.ErrInvalidOptionValue + } + + ep.mu.RLock() + + if ep.closed { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrInvalidEndpointState + } + + // Check whether we've shutdown writing. + if ep.shutdownFlags&tcpip.ShutdownWrite != 0 { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrClosedForSend + } + + // Did the user caller provide a destination? If not, use the connected + // destination. + if opts.To == nil { + // If the user doesn't specify a destination, they should have + // connected to another address. + if !ep.connected { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrNotConnected + } + + if ep.route.IsResolutionRequired() { + savedRoute := &ep.route + // Promote lock to exclusive if using a shared route, + // given that it may need to change in finishWrite. + ep.mu.RUnlock() + ep.mu.Lock() + + // Make sure that the route didn't change during the + // time we didn't hold the lock. + if !ep.connected || savedRoute != &ep.route { + ep.mu.Unlock() + return 0, nil, tcpip.ErrInvalidEndpointState + } + + n, ch, err := ep.finishWrite(payload, savedRoute) + ep.mu.Unlock() + return n, ch, err + } + + n, ch, err := ep.finishWrite(payload, &ep.route) + ep.mu.RUnlock() + return n, ch, err + } + + // The caller provided a destination. Reject destination address if it + // goes through a different NIC than the endpoint was bound to. + nic := opts.To.NIC + if ep.bound && nic != 0 && nic != ep.boundNIC { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrNoRoute + } + + // We don't support IPv6 yet, so this has to be an IPv4 address. + if len(opts.To.Addr) != header.IPv4AddressSize { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrInvalidEndpointState + } + + // Find the route to the destination. If boundAddress is 0, + // FindRoute will choose an appropriate source address. + route, err := ep.stack.FindRoute(nic, ep.boundAddr, opts.To.Addr, ep.netProto, false) + if err != nil { + ep.mu.RUnlock() + return 0, nil, err + } + + n, ch, err := ep.finishWrite(payload, &route) + route.Release() + ep.mu.RUnlock() + return n, ch, err +} + +// finishWrite writes the payload to a route. It resolves the route if +// necessary. It's really just a helper to make defer unnecessary in Write. +func (ep *endpoint) finishWrite(payload tcpip.Payload, route *stack.Route) (uintptr, <-chan struct{}, *tcpip.Error) { + // We may need to resolve the route (match a link layer address to the + // network address). If that requires blocking (e.g. to use ARP), + // return a channel on which the caller can wait. + if route.IsResolutionRequired() { + waker := &sleep.Waker{} + if ch, err := route.Resolve(waker); err != nil { + if err == tcpip.ErrWouldBlock { + // Link address needs to be resolved. + // Resolution was triggered the background. + // Better luck next time. + route.RemoveWaker(waker) + return 0, ch, tcpip.ErrNoLinkAddress + } + return 0, nil, err + } + } + + payloadBytes, err := payload.Get(payload.Size()) + if err != nil { + return 0, nil, err + } + + switch ep.netProto { + case header.IPv4ProtocolNumber: + hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength())) + if err := route.WritePacket(nil /* gso */, hdr, buffer.View(payloadBytes).ToVectorisedView(), header.ICMPv4ProtocolNumber, route.DefaultTTL()); err != nil { + return 0, nil, err + } + + default: + return 0, nil, tcpip.ErrUnknownProtocol + } + + return uintptr(len(payloadBytes)), nil, nil +} + +// Peek implements tcpip.Endpoint.Peek. +func (ep *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil +} + +// Connect implements tcpip.Endpoint.Connect. +func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + ep.mu.Lock() + defer ep.mu.Unlock() + + if ep.closed { + return tcpip.ErrInvalidEndpointState + } + + // We don't support IPv6 yet. + if len(addr.Addr) != header.IPv4AddressSize { + return tcpip.ErrInvalidEndpointState + } + + nic := addr.NIC + if ep.bound { + if ep.boundNIC == 0 { + // If we're bound, but not to a specific NIC, the NIC + // in addr will be used. Nothing to do here. + } else if addr.NIC == 0 { + // If we're bound to a specific NIC, but addr doesn't + // specify a NIC, use the bound NIC. + nic = ep.boundNIC + } else if addr.NIC != ep.boundNIC { + // We're bound and addr specifies a NIC. They must be + // the same. + return tcpip.ErrInvalidEndpointState + } + } + + // Find a route to the destination. + route, err := ep.stack.FindRoute(nic, tcpip.Address(""), addr.Addr, ep.netProto, false) + if err != nil { + return err + } + defer route.Release() + + // Re-register the endpoint with the appropriate NIC. + if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { + return err + } + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + + // Save the route and NIC we've connected via. + ep.route = route.Clone() + ep.registeredNIC = nic + ep.connected = true + + return nil +} + +// Shutdown implements tcpip.Endpoint.Shutdown. +func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + ep.mu.Lock() + defer ep.mu.Unlock() + + if !ep.connected { + return tcpip.ErrNotConnected + } + + ep.shutdownFlags |= flags + + if flags&tcpip.ShutdownRead != 0 { + ep.rcvMu.Lock() + wasClosed := ep.rcvClosed + ep.rcvClosed = true + ep.rcvMu.Unlock() + + if !wasClosed { + ep.waiterQueue.Notify(waiter.EventIn) + } + } + + return nil +} + +// Listen implements tcpip.Endpoint.Listen. +func (ep *endpoint) Listen(backlog int) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Accept implements tcpip.Endpoint.Accept. +func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { + return nil, nil, tcpip.ErrNotSupported +} + +// Bind implements tcpip.Endpoint.Bind. +func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { + ep.mu.Lock() + defer ep.mu.Unlock() + + // Callers must provide an IPv4 address or no network address (for + // binding to a NIC, but not an address). + if len(addr.Addr) != 0 && len(addr.Addr) != 4 { + return tcpip.ErrInvalidEndpointState + } + + // If a local address was specified, verify that it's valid. + if len(addr.Addr) == header.IPv4AddressSize && ep.stack.CheckLocalAddress(addr.NIC, ep.netProto, addr.Addr) == 0 { + return tcpip.ErrBadLocalAddress + } + + // Re-register the endpoint with the appropriate NIC. + if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { + return err + } + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + + ep.registeredNIC = addr.NIC + ep.boundNIC = addr.NIC + ep.boundAddr = addr.Addr + ep.bound = true + + return nil +} + +// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. +func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, tcpip.ErrNotSupported +} + +// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. +func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + ep.mu.RLock() + defer ep.mu.RUnlock() + + if !ep.connected { + return tcpip.FullAddress{}, tcpip.ErrNotConnected + } + + return tcpip.FullAddress{ + NIC: ep.registeredNIC, + Addr: ep.route.RemoteAddress, + }, nil +} + +// Readiness implements tcpip.Endpoint.Readiness. +func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + // The endpoint is always writable. + result := waiter.EventOut & mask + + // Determine whether the endpoint is readable. + if (mask & waiter.EventIn) != 0 { + ep.rcvMu.Lock() + if !ep.rcvList.Empty() || ep.rcvClosed { + result |= waiter.EventIn + } + ep.rcvMu.Unlock() + } + + return result +} + +// SetSockOpt implements tcpip.Endpoint.SetSockOpt. +func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + return nil +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. +func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + return nil + + case *tcpip.SendBufferSizeOption: + ep.mu.Lock() + *o = tcpip.SendBufferSizeOption(ep.sndBufSize) + ep.mu.Unlock() + return nil + + case *tcpip.ReceiveBufferSizeOption: + ep.rcvMu.Lock() + *o = tcpip.ReceiveBufferSizeOption(ep.rcvBufSizeMax) + ep.rcvMu.Unlock() + return nil + + case *tcpip.ReceiveQueueSizeOption: + ep.rcvMu.Lock() + if ep.rcvList.Empty() { + *o = 0 + } else { + p := ep.rcvList.Front() + *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) + } + ep.rcvMu.Unlock() + return nil + + case *tcpip.KeepaliveEnabledOption: + *o = 0 + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } +} + +// HandlePacket implements stack.RawTransportEndpoint.HandlePacket. +func (ep *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) { + ep.rcvMu.Lock() + + // Drop the packet if our buffer is currently full. + if ep.rcvClosed || ep.rcvBufSize >= ep.rcvBufSizeMax { + ep.stack.Stats().DroppedPackets.Increment() + ep.rcvMu.Unlock() + return + } + + if ep.bound { + // If bound to a NIC, only accept data for that NIC. + if ep.boundNIC != 0 && ep.boundNIC != route.NICID() { + ep.rcvMu.Unlock() + return + } + // If bound to an address, only accept data for that address. + if ep.boundAddr != "" && ep.boundAddr != route.RemoteAddress { + ep.rcvMu.Unlock() + return + } + } + + // If connected, only accept packets from the remote address we + // connected to. + if ep.connected && ep.route.RemoteAddress != route.RemoteAddress { + ep.rcvMu.Unlock() + return + } + + wasEmpty := ep.rcvBufSize == 0 + + // Push new packet into receive list and increment the buffer size. + packet := &packet{ + senderAddr: tcpip.FullAddress{ + NIC: route.NICID(), + Addr: route.RemoteAddress, + }, + } + + combinedVV := netHeader.ToVectorisedView() + combinedVV.Append(vv) + packet.data = combinedVV.Clone(packet.views[:]) + packet.timestampNS = ep.stack.NowNanoseconds() + + ep.rcvList.PushBack(packet) + ep.rcvBufSize += packet.data.Size() + + ep.rcvMu.Unlock() + + // Notify waiters that there's data to be read. + if wasEmpty { + ep.waiterQueue.Notify(waiter.EventIn) + } +} diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go new file mode 100644 index 000000000..e8907ebb1 --- /dev/null +++ b/pkg/tcpip/transport/raw/endpoint_state.go @@ -0,0 +1,88 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package raw + +import ( + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" +) + +// saveData saves packet.data field. +func (p *packet) saveData() buffer.VectorisedView { + // We cannot save p.data directly as p.data.views may alias to p.views, + // which is not allowed by state framework (in-struct pointer). + return p.data.Clone(nil) +} + +// loadData loads packet.data field. +func (p *packet) loadData(data buffer.VectorisedView) { + // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization + // here because data.views is not guaranteed to be loaded by now. Plus, + // data.views will be allocated anyway so there really is little point + // of utilizing p.views for data.views. + p.data = data +} + +// beforeSave is invoked by stateify. +func (ep *endpoint) beforeSave() { + // Stop incoming packets from being handled (and mutate endpoint state). + // The lock will be released after saveRcvBufSizeMax(), which would have + // saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming + // packets. + ep.rcvMu.Lock() +} + +// saveRcvBufSizeMax is invoked by stateify. +func (ep *endpoint) saveRcvBufSizeMax() int { + max := ep.rcvBufSizeMax + // Make sure no new packets will be handled regardless of the lock. + ep.rcvBufSizeMax = 0 + // Release the lock acquired in beforeSave() so regular endpoint closing + // logic can proceed after save. + ep.rcvMu.Unlock() + return max +} + +// loadRcvBufSizeMax is invoked by stateify. +func (ep *endpoint) loadRcvBufSizeMax(max int) { + ep.rcvBufSizeMax = max +} + +// afterLoad is invoked by stateify. +func (ep *endpoint) afterLoad() { + // StackFromEnv is a stack used specifically for save/restore. + ep.stack = stack.StackFromEnv + + // If the endpoint is connected, re-connect via the save/restore stack. + if ep.connected { + var err *tcpip.Error + ep.route, err = ep.stack.FindRoute(ep.registeredNIC, ep.boundAddr, ep.route.RemoteAddress, ep.netProto, false) + if err != nil { + panic(*err) + } + } + + // If the endpoint is bound, re-bind via the save/restore stack. + if ep.bound { + if ep.stack.CheckLocalAddress(ep.registeredNIC, ep.netProto, ep.boundAddr) == 0 { + panic(tcpip.ErrBadLocalAddress) + } + } + + if err := ep.stack.RegisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep); err != nil { + panic(*err) + } +} diff --git a/pkg/tcpip/transport/raw/raw.go b/pkg/tcpip/transport/raw/raw.go deleted file mode 100644 index 7004c7ff4..000000000 --- a/pkg/tcpip/transport/raw/raw.go +++ /dev/null @@ -1,558 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package raw provides the implementation of raw sockets (see raw(7)). Raw -// sockets allow applications to: -// -// * manually write and inspect transport layer headers and payloads -// * receive all traffic of a given transport protcol (e.g. ICMP or UDP) -// * optionally write and inspect network layer and link layer headers for -// packets -// -// Raw sockets don't have any notion of ports, and incoming packets are -// demultiplexed solely by protocol number. Thus, a raw UDP endpoint will -// receive every UDP packet received by netstack. bind(2) and connect(2) can be -// used to filter incoming packets by source and destination. -package raw - -import ( - "sync" - - "gvisor.googlesource.com/gvisor/pkg/sleep" - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/header" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// +stateify savable -type packet struct { - packetEntry - // data holds the actual packet data, including any headers and - // payload. - data buffer.VectorisedView `state:".(buffer.VectorisedView)"` - // views is pre-allocated space to back data. As long as the packet is - // made up of fewer than 8 buffer.Views, no extra allocation is - // necessary to store packet data. - views [8]buffer.View `state:"nosave"` - // timestampNS is the unix time at which the packet was received. - timestampNS int64 - // senderAddr is the network address of the sender. - senderAddr tcpip.FullAddress -} - -// endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to -// have goroutines make concurrent calls into the endpoint. -// -// Lock order: -// endpoint.mu -// endpoint.rcvMu -// -// +stateify savable -type endpoint struct { - // The following fields are initialized at creation time and are - // immutable. - stack *stack.Stack `state:"manual"` - netProto tcpip.NetworkProtocolNumber - transProto tcpip.TransportProtocolNumber - waiterQueue *waiter.Queue - - // The following fields are used to manage the receive queue and are - // protected by rcvMu. - rcvMu sync.Mutex `state:"nosave"` - rcvList packetList - rcvBufSizeMax int `state:".(int)"` - rcvBufSize int - rcvClosed bool - - // The following fields are protected by mu. - mu sync.RWMutex `state:"nosave"` - sndBufSize int - // shutdownFlags represent the current shutdown state of the endpoint. - shutdownFlags tcpip.ShutdownFlags - closed bool - connected bool - bound bool - // registeredNIC is the NIC to which th endpoint is explicitly - // registered. Is set when Connect or Bind are used to specify a NIC. - registeredNIC tcpip.NICID - // boundNIC and boundAddr are set on calls to Bind(). When callers - // attempt actions that would invalidate the binding data (e.g. sending - // data via a NIC other than boundNIC), the endpoint will return an - // error. - boundNIC tcpip.NICID - boundAddr tcpip.Address - // route is the route to a remote network endpoint. It is set via - // Connect(), and is valid only when conneted is true. - route stack.Route `state:"manual"` -} - -// NewEndpoint returns a raw endpoint for the given protocols. -// TODO(b/129292371): IP_HDRINCL, IPPROTO_RAW, and AF_PACKET. -func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - if netProto != header.IPv4ProtocolNumber { - return nil, tcpip.ErrUnknownProtocol - } - - ep := &endpoint{ - stack: stack, - netProto: netProto, - transProto: transProto, - waiterQueue: waiterQueue, - rcvBufSizeMax: 32 * 1024, - sndBufSize: 32 * 1024, - } - - if err := ep.stack.RegisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep); err != nil { - return nil, err - } - - return ep, nil -} - -// Close implements tcpip.Endpoint.Close. -func (ep *endpoint) Close() { - ep.mu.Lock() - defer ep.mu.Unlock() - - if ep.closed { - return - } - - ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) - - ep.rcvMu.Lock() - defer ep.rcvMu.Unlock() - - // Clear the receive list. - ep.rcvClosed = true - ep.rcvBufSize = 0 - for !ep.rcvList.Empty() { - ep.rcvList.Remove(ep.rcvList.Front()) - } - - if ep.connected { - ep.route.Release() - } - - ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) -} - -// Read implements tcpip.Endpoint.Read. -func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { - ep.rcvMu.Lock() - - // If there's no data to read, return that read would block or that the - // endpoint is closed. - if ep.rcvList.Empty() { - err := tcpip.ErrWouldBlock - if ep.rcvClosed { - err = tcpip.ErrClosedForReceive - } - ep.rcvMu.Unlock() - return buffer.View{}, tcpip.ControlMessages{}, err - } - - packet := ep.rcvList.Front() - ep.rcvList.Remove(packet) - ep.rcvBufSize -= packet.data.Size() - - ep.rcvMu.Unlock() - - if addr != nil { - *addr = packet.senderAddr - } - - return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil -} - -// Write implements tcpip.Endpoint.Write. -func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { - // MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op. - if opts.More { - return 0, nil, tcpip.ErrInvalidOptionValue - } - - ep.mu.RLock() - - if ep.closed { - ep.mu.RUnlock() - return 0, nil, tcpip.ErrInvalidEndpointState - } - - // Check whether we've shutdown writing. - if ep.shutdownFlags&tcpip.ShutdownWrite != 0 { - ep.mu.RUnlock() - return 0, nil, tcpip.ErrClosedForSend - } - - // Did the user caller provide a destination? If not, use the connected - // destination. - if opts.To == nil { - // If the user doesn't specify a destination, they should have - // connected to another address. - if !ep.connected { - ep.mu.RUnlock() - return 0, nil, tcpip.ErrNotConnected - } - - if ep.route.IsResolutionRequired() { - savedRoute := &ep.route - // Promote lock to exclusive if using a shared route, - // given that it may need to change in finishWrite. - ep.mu.RUnlock() - ep.mu.Lock() - - // Make sure that the route didn't change during the - // time we didn't hold the lock. - if !ep.connected || savedRoute != &ep.route { - ep.mu.Unlock() - return 0, nil, tcpip.ErrInvalidEndpointState - } - - n, ch, err := ep.finishWrite(payload, savedRoute) - ep.mu.Unlock() - return n, ch, err - } - - n, ch, err := ep.finishWrite(payload, &ep.route) - ep.mu.RUnlock() - return n, ch, err - } - - // The caller provided a destination. Reject destination address if it - // goes through a different NIC than the endpoint was bound to. - nic := opts.To.NIC - if ep.bound && nic != 0 && nic != ep.boundNIC { - ep.mu.RUnlock() - return 0, nil, tcpip.ErrNoRoute - } - - // We don't support IPv6 yet, so this has to be an IPv4 address. - if len(opts.To.Addr) != header.IPv4AddressSize { - ep.mu.RUnlock() - return 0, nil, tcpip.ErrInvalidEndpointState - } - - // Find the route to the destination. If boundAddress is 0, - // FindRoute will choose an appropriate source address. - route, err := ep.stack.FindRoute(nic, ep.boundAddr, opts.To.Addr, ep.netProto, false) - if err != nil { - ep.mu.RUnlock() - return 0, nil, err - } - - n, ch, err := ep.finishWrite(payload, &route) - route.Release() - ep.mu.RUnlock() - return n, ch, err -} - -// finishWrite writes the payload to a route. It resolves the route if -// necessary. It's really just a helper to make defer unnecessary in Write. -func (ep *endpoint) finishWrite(payload tcpip.Payload, route *stack.Route) (uintptr, <-chan struct{}, *tcpip.Error) { - // We may need to resolve the route (match a link layer address to the - // network address). If that requires blocking (e.g. to use ARP), - // return a channel on which the caller can wait. - if route.IsResolutionRequired() { - waker := &sleep.Waker{} - if ch, err := route.Resolve(waker); err != nil { - if err == tcpip.ErrWouldBlock { - // Link address needs to be resolved. - // Resolution was triggered the background. - // Better luck next time. - route.RemoveWaker(waker) - return 0, ch, tcpip.ErrNoLinkAddress - } - return 0, nil, err - } - } - - payloadBytes, err := payload.Get(payload.Size()) - if err != nil { - return 0, nil, err - } - - switch ep.netProto { - case header.IPv4ProtocolNumber: - hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength())) - if err := route.WritePacket(nil /* gso */, hdr, buffer.View(payloadBytes).ToVectorisedView(), header.ICMPv4ProtocolNumber, route.DefaultTTL()); err != nil { - return 0, nil, err - } - - default: - return 0, nil, tcpip.ErrUnknownProtocol - } - - return uintptr(len(payloadBytes)), nil, nil -} - -// Peek implements tcpip.Endpoint.Peek. -func (ep *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { - return 0, tcpip.ControlMessages{}, nil -} - -// Connect implements tcpip.Endpoint.Connect. -func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { - ep.mu.Lock() - defer ep.mu.Unlock() - - if ep.closed { - return tcpip.ErrInvalidEndpointState - } - - // We don't support IPv6 yet. - if len(addr.Addr) != header.IPv4AddressSize { - return tcpip.ErrInvalidEndpointState - } - - nic := addr.NIC - if ep.bound { - if ep.boundNIC == 0 { - // If we're bound, but not to a specific NIC, the NIC - // in addr will be used. Nothing to do here. - } else if addr.NIC == 0 { - // If we're bound to a specific NIC, but addr doesn't - // specify a NIC, use the bound NIC. - nic = ep.boundNIC - } else if addr.NIC != ep.boundNIC { - // We're bound and addr specifies a NIC. They must be - // the same. - return tcpip.ErrInvalidEndpointState - } - } - - // Find a route to the destination. - route, err := ep.stack.FindRoute(nic, tcpip.Address(""), addr.Addr, ep.netProto, false) - if err != nil { - return err - } - defer route.Release() - - // Re-register the endpoint with the appropriate NIC. - if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { - return err - } - ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) - - // Save the route and NIC we've connected via. - ep.route = route.Clone() - ep.registeredNIC = nic - ep.connected = true - - return nil -} - -// Shutdown implements tcpip.Endpoint.Shutdown. -func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { - ep.mu.Lock() - defer ep.mu.Unlock() - - if !ep.connected { - return tcpip.ErrNotConnected - } - - ep.shutdownFlags |= flags - - if flags&tcpip.ShutdownRead != 0 { - ep.rcvMu.Lock() - wasClosed := ep.rcvClosed - ep.rcvClosed = true - ep.rcvMu.Unlock() - - if !wasClosed { - ep.waiterQueue.Notify(waiter.EventIn) - } - } - - return nil -} - -// Listen implements tcpip.Endpoint.Listen. -func (ep *endpoint) Listen(backlog int) *tcpip.Error { - return tcpip.ErrNotSupported -} - -// Accept implements tcpip.Endpoint.Accept. -func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { - return nil, nil, tcpip.ErrNotSupported -} - -// Bind implements tcpip.Endpoint.Bind. -func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { - ep.mu.Lock() - defer ep.mu.Unlock() - - // Callers must provide an IPv4 address or no network address (for - // binding to a NIC, but not an address). - if len(addr.Addr) != 0 && len(addr.Addr) != 4 { - return tcpip.ErrInvalidEndpointState - } - - // If a local address was specified, verify that it's valid. - if len(addr.Addr) == header.IPv4AddressSize && ep.stack.CheckLocalAddress(addr.NIC, ep.netProto, addr.Addr) == 0 { - return tcpip.ErrBadLocalAddress - } - - // Re-register the endpoint with the appropriate NIC. - if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { - return err - } - ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) - - ep.registeredNIC = addr.NIC - ep.boundNIC = addr.NIC - ep.boundAddr = addr.Addr - ep.bound = true - - return nil -} - -// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. -func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { - return tcpip.FullAddress{}, tcpip.ErrNotSupported -} - -// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. -func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { - ep.mu.RLock() - defer ep.mu.RUnlock() - - if !ep.connected { - return tcpip.FullAddress{}, tcpip.ErrNotConnected - } - - return tcpip.FullAddress{ - NIC: ep.registeredNIC, - Addr: ep.route.RemoteAddress, - }, nil -} - -// Readiness implements tcpip.Endpoint.Readiness. -func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { - // The endpoint is always writable. - result := waiter.EventOut & mask - - // Determine whether the endpoint is readable. - if (mask & waiter.EventIn) != 0 { - ep.rcvMu.Lock() - if !ep.rcvList.Empty() || ep.rcvClosed { - result |= waiter.EventIn - } - ep.rcvMu.Unlock() - } - - return result -} - -// SetSockOpt implements tcpip.Endpoint.SetSockOpt. -func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { - return nil -} - -// GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch o := opt.(type) { - case tcpip.ErrorOption: - return nil - - case *tcpip.SendBufferSizeOption: - ep.mu.Lock() - *o = tcpip.SendBufferSizeOption(ep.sndBufSize) - ep.mu.Unlock() - return nil - - case *tcpip.ReceiveBufferSizeOption: - ep.rcvMu.Lock() - *o = tcpip.ReceiveBufferSizeOption(ep.rcvBufSizeMax) - ep.rcvMu.Unlock() - return nil - - case *tcpip.ReceiveQueueSizeOption: - ep.rcvMu.Lock() - if ep.rcvList.Empty() { - *o = 0 - } else { - p := ep.rcvList.Front() - *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) - } - ep.rcvMu.Unlock() - return nil - - case *tcpip.KeepaliveEnabledOption: - *o = 0 - return nil - - default: - return tcpip.ErrUnknownProtocolOption - } -} - -// HandlePacket implements stack.RawTransportEndpoint.HandlePacket. -func (ep *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) { - ep.rcvMu.Lock() - - // Drop the packet if our buffer is currently full. - if ep.rcvClosed || ep.rcvBufSize >= ep.rcvBufSizeMax { - ep.stack.Stats().DroppedPackets.Increment() - ep.rcvMu.Unlock() - return - } - - if ep.bound { - // If bound to a NIC, only accept data for that NIC. - if ep.boundNIC != 0 && ep.boundNIC != route.NICID() { - ep.rcvMu.Unlock() - return - } - // If bound to an address, only accept data for that address. - if ep.boundAddr != "" && ep.boundAddr != route.RemoteAddress { - ep.rcvMu.Unlock() - return - } - } - - // If connected, only accept packets from the remote address we - // connected to. - if ep.connected && ep.route.RemoteAddress != route.RemoteAddress { - ep.rcvMu.Unlock() - return - } - - wasEmpty := ep.rcvBufSize == 0 - - // Push new packet into receive list and increment the buffer size. - packet := &packet{ - senderAddr: tcpip.FullAddress{ - NIC: route.NICID(), - Addr: route.RemoteAddress, - }, - } - - combinedVV := netHeader.ToVectorisedView() - combinedVV.Append(vv) - packet.data = combinedVV.Clone(packet.views[:]) - packet.timestampNS = ep.stack.NowNanoseconds() - - ep.rcvList.PushBack(packet) - ep.rcvBufSize += packet.data.Size() - - ep.rcvMu.Unlock() - - // Notify waiters that there's data to be read. - if wasEmpty { - ep.waiterQueue.Notify(waiter.EventIn) - } -} diff --git a/pkg/tcpip/transport/raw/state.go b/pkg/tcpip/transport/raw/state.go deleted file mode 100644 index e8907ebb1..000000000 --- a/pkg/tcpip/transport/raw/state.go +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package raw - -import ( - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" -) - -// saveData saves packet.data field. -func (p *packet) saveData() buffer.VectorisedView { - // We cannot save p.data directly as p.data.views may alias to p.views, - // which is not allowed by state framework (in-struct pointer). - return p.data.Clone(nil) -} - -// loadData loads packet.data field. -func (p *packet) loadData(data buffer.VectorisedView) { - // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization - // here because data.views is not guaranteed to be loaded by now. Plus, - // data.views will be allocated anyway so there really is little point - // of utilizing p.views for data.views. - p.data = data -} - -// beforeSave is invoked by stateify. -func (ep *endpoint) beforeSave() { - // Stop incoming packets from being handled (and mutate endpoint state). - // The lock will be released after saveRcvBufSizeMax(), which would have - // saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming - // packets. - ep.rcvMu.Lock() -} - -// saveRcvBufSizeMax is invoked by stateify. -func (ep *endpoint) saveRcvBufSizeMax() int { - max := ep.rcvBufSizeMax - // Make sure no new packets will be handled regardless of the lock. - ep.rcvBufSizeMax = 0 - // Release the lock acquired in beforeSave() so regular endpoint closing - // logic can proceed after save. - ep.rcvMu.Unlock() - return max -} - -// loadRcvBufSizeMax is invoked by stateify. -func (ep *endpoint) loadRcvBufSizeMax(max int) { - ep.rcvBufSizeMax = max -} - -// afterLoad is invoked by stateify. -func (ep *endpoint) afterLoad() { - // StackFromEnv is a stack used specifically for save/restore. - ep.stack = stack.StackFromEnv - - // If the endpoint is connected, re-connect via the save/restore stack. - if ep.connected { - var err *tcpip.Error - ep.route, err = ep.stack.FindRoute(ep.registeredNIC, ep.boundAddr, ep.route.RemoteAddress, ep.netProto, false) - if err != nil { - panic(*err) - } - } - - // If the endpoint is bound, re-bind via the save/restore stack. - if ep.bound { - if ep.stack.CheckLocalAddress(ep.registeredNIC, ep.netProto, ep.boundAddr) == 0 { - panic(tcpip.ErrBadLocalAddress) - } - } - - if err := ep.stack.RegisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep); err != nil { - panic(*err) - } -} -- cgit v1.2.3