diff options
-rw-r--r-- | pkg/tcpip/stack/registration.go | 13 | ||||
-rw-r--r-- | pkg/tcpip/stack/stack.go | 20 | ||||
-rw-r--r-- | pkg/tcpip/stack/transport_demuxer.go | 67 | ||||
-rw-r--r-- | pkg/tcpip/stack/transport_test.go | 2 | ||||
-rw-r--r-- | pkg/tcpip/transport/icmp/BUILD | 1 | ||||
-rw-r--r-- | pkg/tcpip/transport/icmp/endpoint.go | 76 | ||||
-rw-r--r-- | pkg/tcpip/transport/icmp/protocol.go | 5 | ||||
-rw-r--r-- | pkg/tcpip/transport/raw/BUILD | 45 | ||||
-rw-r--r-- | pkg/tcpip/transport/raw/raw.go | 558 | ||||
-rw-r--r-- | pkg/tcpip/transport/raw/state.go | 88 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint.go | 2 | ||||
-rw-r--r-- | pkg/tcpip/transport/udp/endpoint.go | 2 | ||||
-rw-r--r-- | test/syscalls/linux/raw_socket_ipv4.cc | 421 |
13 files changed, 1106 insertions, 194 deletions
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index ff356ea22..f3cc849ec 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -64,13 +64,24 @@ const ( type TransportEndpoint interface { // HandlePacket is called by the stack when new packets arrive to // this transport endpoint. - HandlePacket(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) + HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) // HandleControlPacket is called by the stack when new control (e.g., // ICMP) packets arrive to this transport endpoint. HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) } +// RawTransportEndpoint is the interface that needs to be implemented by raw +// transport protocol endpoints. RawTransportEndpoints receive the entire +// packet - including the link, network, and transport headers - as delivered +// to netstack. +type RawTransportEndpoint interface { + // HandlePacket is called by the stack when new packets arrive to + // this transport endpoint. The packet contains all data from the link + // layer up. + HandlePacket(r *Route, netHeader buffer.View, packet buffer.VectorisedView) +} + // TransportProtocol is the interface that needs to be implemented by transport // protocols (e.g., tcp, udp) that want to be part of the networking stack. type TransportProtocol interface { diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 15a268b10..a74c0a7a0 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -955,11 +955,11 @@ func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip } // RegisterRawTransportEndpoint registers the given endpoint with the stack -// transport dispatcher. Received packets that match the provided protocol will -// be delivered to the given endpoint. -func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint, reusePort bool) *tcpip.Error { +// transport dispatcher. Received packets that match the provided transport +// protocol will be delivered to the given endpoint. +func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error { if nicID == 0 { - return s.demux.registerRawEndpoint(netProtos, protocol, ep, reusePort) + return s.demux.registerRawEndpoint(netProto, transProto, ep) } s.mu.RLock() @@ -970,14 +970,14 @@ func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProtos []tcpi return tcpip.ErrUnknownNICID } - return nic.demux.registerRawEndpoint(netProtos, protocol, ep, reusePort) + return nic.demux.registerRawEndpoint(netProto, transProto, ep) } -// UnregisterRawTransportEndpoint removes the endpoint for the protocol from -// the stack transport dispatcher. -func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint) { +// UnregisterRawTransportEndpoint removes the endpoint for the transport +// protocol from the stack transport dispatcher. +func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { if nicID == 0 { - s.demux.unregisterRawEndpoint(netProtos, protocol, ep) + s.demux.unregisterRawEndpoint(netProto, transProto, ep) return } @@ -986,7 +986,7 @@ func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProtos []tc nic := s.nics[nicID] if nic != nil { - nic.demux.unregisterRawEndpoint(netProtos, protocol, ep) + nic.demux.unregisterRawEndpoint(netProto, transProto, ep) } } diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index 9ab314188..a8ac18e72 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -15,6 +15,7 @@ package stack import ( + "fmt" "math/rand" "sync" @@ -37,7 +38,7 @@ type transportEndpoints struct { endpoints map[TransportEndpointID]TransportEndpoint // rawEndpoints contains endpoints for raw sockets, which receive all // traffic of a given protocol regardless of port. - rawEndpoints []TransportEndpoint + rawEndpoints []RawTransportEndpoint } // unregisterEndpoint unregisters the endpoint with the given id such that it @@ -60,8 +61,10 @@ func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep Tra // transportDemuxer demultiplexes packets targeted at a transport endpoint // (i.e., after they've been parsed by the network layer). It does two levels // of demultiplexing: first based on the network and transport protocols, then -// based on endpoints IDs. +// based on endpoints IDs. It should only be instantiated via +// newTransportDemuxer. type transportDemuxer struct { + // protocol is immutable. protocol map[protocolIDs]*transportEndpoints } @@ -137,22 +140,22 @@ func (ep *multiPortEndpoint) selectEndpoint(id TransportEndpointID) TransportEnd // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. -func (ep *multiPortEndpoint) HandlePacket(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) { +func (ep *multiPortEndpoint) HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) { // If this is a broadcast datagram, deliver the datagram to all endpoints // managed by ep. if id.LocalAddress == header.IPv4Broadcast { for i, endpoint := range ep.endpointsArr { // HandlePacket modifies vv, so each endpoint needs its own copy. if i == len(ep.endpointsArr)-1 { - endpoint.HandlePacket(r, id, netHeader, vv) + endpoint.HandlePacket(r, id, vv) break } vvCopy := buffer.NewView(vv.Size()) copy(vvCopy, vv.ToView()) - endpoint.HandlePacket(r, id, buffer.NewViewFromBytes(netHeader), vvCopy.ToVectorisedView()) + endpoint.HandlePacket(r, id, vvCopy.ToVectorisedView()) } } else { - ep.selectEndpoint(id).HandlePacket(r, id, netHeader, vv) + ep.selectEndpoint(id).HandlePacket(r, id, vv) } } @@ -286,17 +289,17 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto // As in net/ipv4/ip_input.c:ip_local_deliver, attempt to deliver via // raw endpoint first. If there are multipe raw endpoints, they all // receive the packet. - found := false + foundRaw := false for _, rawEP := range eps.rawEndpoints { // Each endpoint gets its own copy of the packet for the sake // of save/restore. - rawEP.HandlePacket(r, id, buffer.NewViewFromBytes(netHeader), vv.ToView().ToVectorisedView()) - found = true + rawEP.HandlePacket(r, buffer.NewViewFromBytes(netHeader), vv.ToView().ToVectorisedView()) + foundRaw = true } eps.mu.RUnlock() // Fail if we didn't find at least one matching transport endpoint. - if len(destEps) == 0 && !found { + if len(destEps) == 0 && !foundRaw { // UDP packet could not be delivered to an unknown destination port. if protocol == header.UDPProtocolNumber { r.Stats().UDP.UnknownPortErrors.Increment() @@ -306,7 +309,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto // Deliver the packet. for _, ep := range destEps { - ep.HandlePacket(r, id, netHeader, vv) + ep.HandlePacket(r, id, vv) } return true @@ -371,19 +374,8 @@ func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer // that packets of the appropriate protocol are delivered to it. A single // packet can be sent to one or more raw endpoints along with a non-raw // endpoint. -func (d *transportDemuxer) registerRawEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint, reusePort bool) *tcpip.Error { - for i, n := range netProtos { - if err := d.singleRegisterRawEndpoint(n, protocol, ep); err != nil { - d.unregisterRawEndpoint(netProtos[:i], protocol, ep) - return err - } - } - - return nil -} - -func (d *transportDemuxer) singleRegisterRawEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint) *tcpip.Error { - eps, ok := d.protocol[protocolIDs{netProto, protocol}] +func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error { + eps, ok := d.protocol[protocolIDs{netProto, transProto}] if !ok { return nil } @@ -395,19 +387,20 @@ func (d *transportDemuxer) singleRegisterRawEndpoint(netProto tcpip.NetworkProto return nil } -// unregisterRawEndpoint unregisters the raw endpoint for the given protocol -// such that it won't receive any more packets. -func (d *transportDemuxer) unregisterRawEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint) { - for _, n := range netProtos { - if eps, ok := d.protocol[protocolIDs{n, protocol}]; ok { - eps.mu.Lock() - defer eps.mu.Unlock() - for i, rawEP := range eps.rawEndpoints { - if rawEP == ep { - eps.rawEndpoints = append(eps.rawEndpoints[:i], eps.rawEndpoints[i+1:]...) - return - } - } +// unregisterRawEndpoint unregisters the raw endpoint for the given transport +// protocol such that it won't receive any more packets. +func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { + eps, ok := d.protocol[protocolIDs{netProto, transProto}] + if !ok { + panic(fmt.Errorf("tried to unregister endpoint with unsupported network and transport protocol pair: %d, %d", netProto, transProto)) + } + + eps.mu.Lock() + defer eps.mu.Unlock() + for i, rawEP := range eps.rawEndpoints { + if rawEP == ep { + eps.rawEndpoints = append(eps.rawEndpoints[:i], eps.rawEndpoints[i+1:]...) + return } } } diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index dfd31557a..0c2589083 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -168,7 +168,7 @@ func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Erro return tcpip.FullAddress{}, nil } -func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ buffer.View, _ buffer.VectorisedView) { +func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ buffer.VectorisedView) { // Increment the number of received packets. f.proto.packetCount++ if f.acceptQueue != nil { diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD index 74d9ff253..9aa6f3978 100644 --- a/pkg/tcpip/transport/icmp/BUILD +++ b/pkg/tcpip/transport/icmp/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/raw", "//pkg/waiter", ], ) diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 182097b46..8f2e3aa20 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -59,10 +59,6 @@ type endpoint struct { netProto tcpip.NetworkProtocolNumber transProto tcpip.TransportProtocolNumber waiterQueue *waiter.Queue - // raw indicates whether the endpoint is intended for use by a raw - // socket, which returns the network layer header along with the - // payload. It is immutable. - raw bool // The following fields are used to manage the receive queue, and are // protected by rcvMu. @@ -80,32 +76,26 @@ type endpoint struct { shutdownFlags tcpip.ShutdownFlags id stack.TransportEndpointID state endpointState - bindNICID tcpip.NICID - bindAddr tcpip.Address - regNICID tcpip.NICID - route stack.Route `state:"manual"` + // bindNICID and bindAddr are set via calls to Bind(). They are used to + // reject attempts to send data or connect via a different NIC or + // address + bindNICID tcpip.NICID + bindAddr tcpip.Address + // regNICID is the default NIC to be used when callers don't specify a + // NIC. + regNICID tcpip.NICID + route stack.Route `state:"manual"` } -func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, raw bool) (*endpoint, *tcpip.Error) { - e := &endpoint{ +func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return &endpoint{ stack: stack, netProto: netProto, transProto: transProto, waiterQueue: waiterQueue, rcvBufSizeMax: 32 * 1024, sndBufSize: 32 * 1024, - raw: raw, - } - - // Raw endpoints must be immediately bound because they receive all - // ICMP traffic starting from when they're created via socket(). - if raw { - if err := e.bindLocked(tcpip.FullAddress{}); err != nil { - return nil, err - } - } - - return e, nil + }, nil } // Close puts the endpoint in a closed state and frees all resources @@ -115,11 +105,7 @@ func (e *endpoint) Close() { e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite switch e.state { case stateBound, stateConnected: - if e.raw { - e.stack.UnregisterRawTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e) - } else { - e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id, e) - } + e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id, e) } // Close the receive list and drain it. @@ -244,8 +230,9 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c route = &e.route if route.IsResolutionRequired() { - // Promote lock to exclusive if using a shared route, given that it may - // need to change in Route.Resolve() call below. + // Promote lock to exclusive if using a shared route, + // given that it may need to change in Route.Resolve() + // call below. e.mu.RUnlock() defer e.mu.RLock() @@ -290,8 +277,9 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c waker := &sleep.Waker{} if ch, err := route.Resolve(waker); err != nil { if err == tcpip.ErrWouldBlock { - // Link address needs to be resolved. Resolution was triggered the - // background. Better luck next time. + // Link address needs to be resolved. + // Resolution was triggered the background. + // Better luck next time. route.RemoveWaker(waker) return 0, ch, tcpip.ErrNoLinkAddress } @@ -368,11 +356,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } func (e *endpoint) send4(r *stack.Route, data buffer.View) *tcpip.Error { - if e.raw { - hdr := buffer.NewPrependable(len(data) + int(r.MaxHeaderLength())) - return r.WritePacket(nil /* gso */, hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) - } - if len(data) < header.ICMPv4EchoMinimumSize { return tcpip.ErrInvalidEndpointState } @@ -439,11 +422,6 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (t // Connect connects the endpoint to its peer. Specifying a NIC is optional. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { - // TODO: We don't yet support connect on a raw socket. - if e.raw { - return tcpip.ErrNotSupported - } - e.mu.Lock() defer e.mu.Unlock() @@ -547,11 +525,6 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { } func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) { - if e.raw { - err := e.stack.RegisterRawTransportEndpoint(nicid, netProtos, e.transProto, e, false) - return stack.TransportEndpointID{}, err - } - if id.LocalPort != 0 { // The endpoint already has a local port, just attempt to // register it. @@ -687,11 +660,12 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) { +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { e.rcvMu.Lock() // Drop the packet if our buffer is currently full. if !e.rcvReady || e.rcvClosed || e.rcvBufSize >= e.rcvBufSizeMax { + e.stack.Stats().DroppedPackets.Increment() e.rcvMu.Unlock() return } @@ -706,13 +680,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, ne }, } - if e.raw { - combinedVV := netHeader.ToVectorisedView() - combinedVV.Append(vv) - pkt.data = combinedVV.Clone(pkt.views[:]) - } else { - pkt.data = vv.Clone(pkt.views[:]) - } + pkt.data = vv.Clone(pkt.views[:]) e.rcvList.PushBack(pkt) e.rcvBufSize += pkt.data.Size() diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go index 36b70988a..09ee2f892 100644 --- a/pkg/tcpip/transport/icmp/protocol.go +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -30,6 +30,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/raw" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -73,7 +74,7 @@ func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtoco if netProto != p.netProto() { return nil, tcpip.ErrUnknownProtocol } - return newEndpoint(stack, netProto, p.number, waiterQueue, false) + return newEndpoint(stack, netProto, p.number, waiterQueue) } // NewRawEndpoint creates a new raw icmp endpoint. It implements @@ -82,7 +83,7 @@ func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProt if netProto != p.netProto() { return nil, tcpip.ErrUnknownProtocol } - return newEndpoint(stack, netProto, p.number, waiterQueue, true) + return raw.NewEndpoint(stack, netProto, p.number, waiterQueue) } // MinimumPacketSize returns the minimum valid icmp packet size. diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD new file mode 100644 index 000000000..005079639 --- /dev/null +++ b/pkg/tcpip/transport/raw/BUILD @@ -0,0 +1,45 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_library") + +go_template_instance( + name = "packet_list", + out = "packet_list.go", + package = "raw", + prefix = "packet", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*packet", + "Linker": "*packet", + }, +) + +go_library( + name = "raw", + srcs = [ + "packet_list.go", + "raw.go", + "state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/raw", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/log", + "//pkg/sleep", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + "//pkg/waiter", + ], +) + +filegroup( + name = "autogen", + srcs = [ + "packet_list.go", + ], + visibility = ["//:sandbox"], +) diff --git a/pkg/tcpip/transport/raw/raw.go b/pkg/tcpip/transport/raw/raw.go new file mode 100644 index 000000000..8dada2e4f --- /dev/null +++ b/pkg/tcpip/transport/raw/raw.go @@ -0,0 +1,558 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package raw provides the implementation of raw sockets (see raw(7)). Raw +// sockets allow applications to: +// +// * manually write and inspect transport layer headers and payloads +// * receive all traffic of a given transport protcol (e.g. ICMP or UDP) +// * optionally write and inspect network layer and link layer headers for +// packets +// +// Raw sockets don't have any notion of ports, and incoming packets are +// demultiplexed solely by protocol number. Thus, a raw UDP endpoint will +// receive every UDP packet received by netstack. bind(2) and connect(2) can be +// used to filter incoming packets by source and destination. +package raw + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sleep" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// +stateify savable +type packet struct { + packetEntry + // data holds the actual packet data, including any headers and + // payload. + data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + // views is pre-allocated space to back data. As long as the packet is + // made up of fewer than 8 buffer.Views, no extra allocation is + // necessary to store packet data. + views [8]buffer.View `state:"nosave"` + // timestampNS is the unix time at which the packet was received. + timestampNS int64 + // senderAddr is the network address of the sender. + senderAddr tcpip.FullAddress +} + +// endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to +// have goroutines make concurrent calls into the endpoint. +// +// Lock order: +// endpoint.mu +// endpoint.rcvMu +// +// +stateify savable +type endpoint struct { + // The following fields are initialized at creation time and are + // immutable. + stack *stack.Stack `state:"manual"` + netProto tcpip.NetworkProtocolNumber + transProto tcpip.TransportProtocolNumber + waiterQueue *waiter.Queue + + // The following fields are used to manage the receive queue and are + // protected by rcvMu. + rcvMu sync.Mutex `state:"nosave"` + rcvList packetList + rcvBufSizeMax int `state:".(int)"` + rcvBufSize int + rcvClosed bool + + // The following fields are protected by mu. + mu sync.RWMutex `state:"nosave"` + sndBufSize int + // shutdownFlags represent the current shutdown state of the endpoint. + shutdownFlags tcpip.ShutdownFlags + closed bool + connected bool + bound bool + // registeredNIC is the NIC to which th endpoint is explicitly + // registered. Is set when Connect or Bind are used to specify a NIC. + registeredNIC tcpip.NICID + // boundNIC and boundAddr are set on calls to Bind(). When callers + // attempt actions that would invalidate the binding data (e.g. sending + // data via a NIC other than boundNIC), the endpoint will return an + // error. + boundNIC tcpip.NICID + boundAddr tcpip.Address + // route is the route to a remote network endpoint. It is set via + // Connect(), and is valid only when conneted is true. + route stack.Route `state:"manual"` +} + +// NewEndpoint returns a raw endpoint for the given protocols. +// TODO: IP_HDRINCL, IPPROTO_RAW, and AF_PACKET. +func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + if netProto != header.IPv4ProtocolNumber { + return nil, tcpip.ErrUnknownProtocol + } + + ep := &endpoint{ + stack: stack, + netProto: netProto, + transProto: transProto, + waiterQueue: waiterQueue, + rcvBufSizeMax: 32 * 1024, + sndBufSize: 32 * 1024, + } + + if err := ep.stack.RegisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep); err != nil { + return nil, err + } + + return ep, nil +} + +// Close implements tcpip.Endpoint.Close. +func (ep *endpoint) Close() { + ep.mu.Lock() + defer ep.mu.Unlock() + + if ep.closed { + return + } + + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + + ep.rcvMu.Lock() + defer ep.rcvMu.Unlock() + + // Clear the receive list. + ep.rcvClosed = true + ep.rcvBufSize = 0 + for !ep.rcvList.Empty() { + ep.rcvList.Remove(ep.rcvList.Front()) + } + + if ep.connected { + ep.route.Release() + } + + ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) +} + +// Read implements tcpip.Endpoint.Read. +func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + ep.rcvMu.Lock() + + // If there's no data to read, return that read would block or that the + // endpoint is closed. + if ep.rcvList.Empty() { + err := tcpip.ErrWouldBlock + if ep.rcvClosed { + err = tcpip.ErrClosedForReceive + } + ep.rcvMu.Unlock() + return buffer.View{}, tcpip.ControlMessages{}, err + } + + packet := ep.rcvList.Front() + ep.rcvList.Remove(packet) + ep.rcvBufSize -= packet.data.Size() + + ep.rcvMu.Unlock() + + if addr != nil { + *addr = packet.senderAddr + } + + return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil +} + +// Write implements tcpip.Endpoint.Write. +func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { + // MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op. + if opts.More { + return 0, nil, tcpip.ErrInvalidOptionValue + } + + ep.mu.RLock() + + if ep.closed { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrInvalidEndpointState + } + + // Check whether we've shutdown writing. + if ep.shutdownFlags&tcpip.ShutdownWrite != 0 { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrClosedForSend + } + + // Did the user caller provide a destination? If not, use the connected + // destination. + if opts.To == nil { + // If the user doesn't specify a destination, they should have + // connected to another address. + if !ep.connected { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrNotConnected + } + + if ep.route.IsResolutionRequired() { + savedRoute := &ep.route + // Promote lock to exclusive if using a shared route, + // given that it may need to change in finishWrite. + ep.mu.RUnlock() + ep.mu.Lock() + + // Make sure that the route didn't change during the + // time we didn't hold the lock. + if !ep.connected || savedRoute != &ep.route { + ep.mu.Unlock() + return 0, nil, tcpip.ErrInvalidEndpointState + } + + n, ch, err := ep.finishWrite(payload, savedRoute) + ep.mu.Unlock() + return n, ch, err + } + + n, ch, err := ep.finishWrite(payload, &ep.route) + ep.mu.RUnlock() + return n, ch, err + } + + // The caller provided a destination. Reject destination address if it + // goes through a different NIC than the endpoint was bound to. + nic := opts.To.NIC + if ep.bound && nic != 0 && nic != ep.boundNIC { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrNoRoute + } + + // We don't support IPv6 yet, so this has to be an IPv4 address. + if len(opts.To.Addr) != header.IPv4AddressSize { + ep.mu.RUnlock() + return 0, nil, tcpip.ErrInvalidEndpointState + } + + // Find the route to the destination. If boundAddress is 0, + // FindRoute will choose an appropriate source address. + route, err := ep.stack.FindRoute(nic, ep.boundAddr, opts.To.Addr, ep.netProto, false) + if err != nil { + ep.mu.RUnlock() + return 0, nil, err + } + + n, ch, err := ep.finishWrite(payload, &route) + route.Release() + ep.mu.RUnlock() + return n, ch, err +} + +// finishWrite writes the payload to a route. It resolves the route if +// necessary. It's really just a helper to make defer unnecessary in Write. +func (ep *endpoint) finishWrite(payload tcpip.Payload, route *stack.Route) (uintptr, <-chan struct{}, *tcpip.Error) { + // We may need to resolve the route (match a link layer address to the + // network address). If that requires blocking (e.g. to use ARP), + // return a channel on which the caller can wait. + if route.IsResolutionRequired() { + waker := &sleep.Waker{} + if ch, err := route.Resolve(waker); err != nil { + if err == tcpip.ErrWouldBlock { + // Link address needs to be resolved. + // Resolution was triggered the background. + // Better luck next time. + route.RemoveWaker(waker) + return 0, ch, tcpip.ErrNoLinkAddress + } + return 0, nil, err + } + } + + payloadBytes, err := payload.Get(payload.Size()) + if err != nil { + return 0, nil, err + } + + switch ep.netProto { + case header.IPv4ProtocolNumber: + hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength())) + if err := route.WritePacket(nil /* gso */, hdr, buffer.View(payloadBytes).ToVectorisedView(), header.ICMPv4ProtocolNumber, route.DefaultTTL()); err != nil { + return 0, nil, err + } + + default: + return 0, nil, tcpip.ErrUnknownProtocol + } + + return uintptr(len(payloadBytes)), nil, nil +} + +// Peek implements tcpip.Endpoint.Peek. +func (ep *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil +} + +// Connect implements tcpip.Endpoint.Connect. +func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + ep.mu.Lock() + defer ep.mu.Unlock() + + if ep.closed { + return tcpip.ErrInvalidEndpointState + } + + // We don't support IPv6 yet. + if len(addr.Addr) != header.IPv4AddressSize { + return tcpip.ErrInvalidEndpointState + } + + nic := addr.NIC + if ep.bound { + if ep.boundNIC == 0 { + // If we're bound, but not to a specific NIC, the NIC + // in addr will be used. Nothing to do here. + } else if addr.NIC == 0 { + // If we're bound to a specific NIC, but addr doesn't + // specify a NIC, use the bound NIC. + nic = ep.boundNIC + } else if addr.NIC != ep.boundNIC { + // We're bound and addr specifies a NIC. They must be + // the same. + return tcpip.ErrInvalidEndpointState + } + } + + // Find a route to the destination. + route, err := ep.stack.FindRoute(nic, tcpip.Address(""), addr.Addr, ep.netProto, false) + if err != nil { + return err + } + defer route.Release() + + // Re-register the endpoint with the appropriate NIC. + if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { + return err + } + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + + // Save the route and NIC we've connected via. + ep.route = route.Clone() + ep.registeredNIC = nic + ep.connected = true + + return nil +} + +// Shutdown implements tcpip.Endpoint.Shutdown. +func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + ep.mu.Lock() + defer ep.mu.Unlock() + + if !ep.connected { + return tcpip.ErrNotConnected + } + + ep.shutdownFlags |= flags + + if flags&tcpip.ShutdownRead != 0 { + ep.rcvMu.Lock() + wasClosed := ep.rcvClosed + ep.rcvClosed = true + ep.rcvMu.Unlock() + + if !wasClosed { + ep.waiterQueue.Notify(waiter.EventIn) + } + } + + return nil +} + +// Listen implements tcpip.Endpoint.Listen. +func (ep *endpoint) Listen(backlog int) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Accept implements tcpip.Endpoint.Accept. +func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { + return nil, nil, tcpip.ErrNotSupported +} + +// Bind implements tcpip.Endpoint.Bind. +func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { + ep.mu.Lock() + defer ep.mu.Unlock() + + // Callers must provide an IPv4 address or no network address (for + // binding to a NIC, but not an address). + if len(addr.Addr) != 0 && len(addr.Addr) != 4 { + return tcpip.ErrInvalidEndpointState + } + + // If a local address was specified, verify that it's valid. + if len(addr.Addr) == header.IPv4AddressSize && ep.stack.CheckLocalAddress(addr.NIC, ep.netProto, addr.Addr) == 0 { + return tcpip.ErrBadLocalAddress + } + + // Re-register the endpoint with the appropriate NIC. + if err := ep.stack.RegisterRawTransportEndpoint(addr.NIC, ep.netProto, ep.transProto, ep); err != nil { + return err + } + ep.stack.UnregisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep) + + ep.registeredNIC = addr.NIC + ep.boundNIC = addr.NIC + ep.boundAddr = addr.Addr + ep.bound = true + + return nil +} + +// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. +func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, tcpip.ErrNotSupported +} + +// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. +func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + ep.mu.RLock() + defer ep.mu.RUnlock() + + if !ep.connected { + return tcpip.FullAddress{}, tcpip.ErrNotConnected + } + + return tcpip.FullAddress{ + NIC: ep.registeredNIC, + Addr: ep.route.RemoteAddress, + }, nil +} + +// Readiness implements tcpip.Endpoint.Readiness. +func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + // The endpoint is always writable. + result := waiter.EventOut & mask + + // Determine whether the endpoint is readable. + if (mask & waiter.EventIn) != 0 { + ep.rcvMu.Lock() + if !ep.rcvList.Empty() || ep.rcvClosed { + result |= waiter.EventIn + } + ep.rcvMu.Unlock() + } + + return result +} + +// SetSockOpt implements tcpip.Endpoint.SetSockOpt. +func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + return nil +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. +func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + return nil + + case *tcpip.SendBufferSizeOption: + ep.mu.Lock() + *o = tcpip.SendBufferSizeOption(ep.sndBufSize) + ep.mu.Unlock() + return nil + + case *tcpip.ReceiveBufferSizeOption: + ep.rcvMu.Lock() + *o = tcpip.ReceiveBufferSizeOption(ep.rcvBufSizeMax) + ep.rcvMu.Unlock() + return nil + + case *tcpip.ReceiveQueueSizeOption: + ep.rcvMu.Lock() + if ep.rcvList.Empty() { + *o = 0 + } else { + p := ep.rcvList.Front() + *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) + } + ep.rcvMu.Unlock() + return nil + + case *tcpip.KeepaliveEnabledOption: + *o = 0 + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } +} + +// HandlePacket implements stack.RawTransportEndpoint.HandlePacket. +func (ep *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) { + ep.rcvMu.Lock() + + // Drop the packet if our buffer is currently full. + if ep.rcvClosed || ep.rcvBufSize >= ep.rcvBufSizeMax { + ep.stack.Stats().DroppedPackets.Increment() + ep.rcvMu.Unlock() + return + } + + if ep.bound { + // If bound to a NIC, only accept data for that NIC. + if ep.boundNIC != 0 && ep.boundNIC != route.NICID() { + ep.rcvMu.Unlock() + return + } + // If bound to an address, only accept data for that address. + if ep.boundAddr != "" && ep.boundAddr != route.RemoteAddress { + ep.rcvMu.Unlock() + return + } + } + + // If connected, only accept packets from the remote address we + // connected to. + if ep.connected && ep.route.RemoteAddress != route.RemoteAddress { + ep.rcvMu.Unlock() + return + } + + wasEmpty := ep.rcvBufSize == 0 + + // Push new packet into receive list and increment the buffer size. + packet := &packet{ + senderAddr: tcpip.FullAddress{ + NIC: route.NICID(), + Addr: route.RemoteAddress, + }, + } + + combinedVV := netHeader.ToVectorisedView() + combinedVV.Append(vv) + packet.data = combinedVV.Clone(packet.views[:]) + packet.timestampNS = ep.stack.NowNanoseconds() + + ep.rcvList.PushBack(packet) + ep.rcvBufSize += packet.data.Size() + + ep.rcvMu.Unlock() + + // Notify waiters that there's data to be read. + if wasEmpty { + ep.waiterQueue.Notify(waiter.EventIn) + } +} diff --git a/pkg/tcpip/transport/raw/state.go b/pkg/tcpip/transport/raw/state.go new file mode 100644 index 000000000..e3891a8b8 --- /dev/null +++ b/pkg/tcpip/transport/raw/state.go @@ -0,0 +1,88 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package raw + +import ( + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" +) + +// saveData saves packet.data field. +func (p *packet) saveData() buffer.VectorisedView { + // We cannot save p.data directly as p.data.views may alias to p.views, + // which is not allowed by state framework (in-struct pointer). + return p.data.Clone(nil) +} + +// loadData loads packet.data field. +func (p *packet) loadData(data buffer.VectorisedView) { + // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization + // here because data.views is not guaranteed to be loaded by now. Plus, + // data.views will be allocated anyway so there really is little point + // of utilizing p.views for data.views. + p.data = data +} + +// beforeSave is invoked by stateify. +func (ep *endpoint) beforeSave() { + // Stop incoming packets from being handled (and mutate endpoint state). + // The lock will be released after saveRcvBufSizeMax(), which would have + // saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming + // packets. + ep.rcvMu.Lock() +} + +// saveRcvBufSizeMax is invoked by stateify. +func (ep *endpoint) saveRcvBufSizeMax() int { + max := ep.rcvBufSizeMax + // Make sure no new packets will be handled regardless of the lock. + ep.rcvBufSizeMax = 0 + // Release the lock acquired in beforeSave() so regular endpoint closing + // logic can proceed after save. + ep.rcvMu.Unlock() + return max +} + +// loadRcvBufSizeMax is invoked by stateify. +func (ep *endpoint) loadRcvBufSizeMax(max int) { + ep.rcvBufSizeMax = max +} + +// afterLoad is invoked by stateify. +func (ep *endpoint) afterLoad() { + // StackFromEnv is a stack used specifically for save/restore. + ep.stack = stack.StackFromEnv + + // If the endpoint is connected, re-connect via the save/restore stack. + if ep.connected { + var err *tcpip.Error + ep.route, err = ep.stack.FindRoute(ep.registeredNIC, ep.boundAddr, ep.route.RemoteAddress, ep.netProto, false) + if err != nil { + panic(*err) + } + } + + // If the endpoint is bound, re-bind via the save/restore stack. + if ep.bound { + if ep.stack.CheckLocalAddress(ep.registeredNIC, ep.netProto, ep.boundAddr) == 0 { + panic(tcpip.ErrBadLocalAddress) + } + } + + if err := ep.stack.RegisterRawTransportEndpoint(ep.registeredNIC, ep.netProto, ep.transProto, ep); err != nil { + panic(*err) + } +} diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 0427af34f..41c87cc7e 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1438,7 +1438,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) { +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { s := newSegment(r, id, vv) if !s.parse() { e.stack.Stats().MalformedRcvdPackets.Increment() diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 5637f46e3..19e532180 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -940,7 +940,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) { +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { // Get the header then trim it from the view. hdr := header.UDP(vv.First()) if int(hdr.Length()) > vv.Size() { diff --git a/test/syscalls/linux/raw_socket_ipv4.cc b/test/syscalls/linux/raw_socket_ipv4.cc index 19cded07f..b13806dcb 100644 --- a/test/syscalls/linux/raw_socket_ipv4.cc +++ b/test/syscalls/linux/raw_socket_ipv4.cc @@ -16,8 +16,11 @@ #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip_icmp.h> +#include <sys/poll.h> #include <sys/socket.h> #include <sys/types.h> +#include <unistd.h> +#include <algorithm> #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" @@ -39,22 +42,26 @@ class RawSocketTest : public ::testing::Test { // Closes the socket created by SetUp(). void TearDown() override; - // The socket used for both reading and writing. - int s_; + // Checks that both an ICMP echo request and reply are received. Calls should + // be wrapped in ASSERT_NO_FATAL_FAILURE. + void ExpectICMPSuccess(const struct icmphdr& icmp); - // The loopback address. - struct sockaddr_in addr_; + void SendEmptyICMP(const struct icmphdr& icmp); + + void SendEmptyICMPTo(int sock, struct sockaddr_in* addr, + const struct icmphdr& icmp); - void SendEmptyICMP(struct icmphdr *icmp); + void ReceiveICMP(char* recv_buf, size_t recv_buf_len, size_t expected_size, + struct sockaddr_in* src); - void SendEmptyICMPTo(int sock, struct sockaddr_in *addr, - struct icmphdr *icmp); + void ReceiveICMPFrom(char* recv_buf, size_t recv_buf_len, + size_t expected_size, struct sockaddr_in* src, int sock); - void ReceiveICMP(char *recv_buf, size_t recv_buf_len, size_t expected_size, - struct sockaddr_in *src); + // The socket used for both reading and writing. + int s_; - void ReceiveICMPFrom(char *recv_buf, size_t recv_buf_len, - size_t expected_size, struct sockaddr_in *src, int sock); + // The loopback address. + struct sockaddr_in addr_; }; void RawSocketTest::SetUp() { @@ -100,49 +107,9 @@ TEST_F(RawSocketTest, SendAndReceive) { icmp.checksum = 2011; icmp.un.echo.sequence = 2012; icmp.un.echo.id = 2014; - ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(&icmp)); - - // We're going to receive both the echo request and reply, but the order is - // indeterminate. - char recv_buf[512]; - struct sockaddr_in src; - bool received_request = false; - bool received_reply = false; - - for (int i = 0; i < 2; i++) { - // Receive the packet. - ASSERT_NO_FATAL_FAILURE(ReceiveICMP(recv_buf, ABSL_ARRAYSIZE(recv_buf), - sizeof(struct icmphdr), &src)); - EXPECT_EQ(memcmp(&src, &addr_, sizeof(sockaddr_in)), 0); - struct icmphdr *recvd_icmp = - reinterpret_cast<struct icmphdr *>(recv_buf + sizeof(struct iphdr)); - switch (recvd_icmp->type) { - case ICMP_ECHO: - EXPECT_FALSE(received_request); - received_request = true; - // The packet should be identical to what we sent. - EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &icmp, sizeof(icmp)), - 0); - break; - - case ICMP_ECHOREPLY: - EXPECT_FALSE(received_reply); - received_reply = true; - // Most fields should be the same. - EXPECT_EQ(recvd_icmp->code, icmp.code); - EXPECT_EQ(recvd_icmp->un.echo.sequence, icmp.un.echo.sequence); - EXPECT_EQ(recvd_icmp->un.echo.id, icmp.un.echo.id); - // A couple are different. - EXPECT_EQ(recvd_icmp->type, ICMP_ECHOREPLY); - // The checksum is computed in such a way that it is guaranteed to have - // changed. - EXPECT_NE(recvd_icmp->checksum, icmp.checksum); - break; - } - } + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); - ASSERT_TRUE(received_request); - ASSERT_TRUE(received_reply); + ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); } // We should be able to create multiple raw sockets for the same protocol and @@ -162,7 +129,7 @@ TEST_F(RawSocketTest, MultipleSocketReceive) { icmp.checksum = 2014; icmp.un.echo.sequence = 2016; icmp.un.echo.id = 2018; - ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(&icmp)); + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); // Both sockets will receive the echo request and reply in indeterminate // order, so we'll need to read 2 packets from each. @@ -191,12 +158,14 @@ TEST_F(RawSocketTest, MultipleSocketReceive) { int types[] = {ICMP_ECHO, ICMP_ECHOREPLY}; for (int type : types) { auto match_type = [=](char buf[kBufSize]) { - struct icmphdr *icmp = - reinterpret_cast<struct icmphdr *>(buf + sizeof(struct iphdr)); + struct icmphdr* icmp = + reinterpret_cast<struct icmphdr*>(buf + sizeof(struct iphdr)); return icmp->type == type; }; - char *icmp1 = *std::find_if(recv_buf1.begin(), recv_buf1.end(), match_type); - char *icmp2 = *std::find_if(recv_buf2.begin(), recv_buf2.end(), match_type); + const char* icmp1 = + *std::find_if(recv_buf1.begin(), recv_buf1.end(), match_type); + const char* icmp2 = + *std::find_if(recv_buf2.begin(), recv_buf2.end(), match_type); ASSERT_NE(icmp1, *recv_buf1.end()); ASSERT_NE(icmp2, *recv_buf2.end()); EXPECT_EQ(memcmp(icmp1 + sizeof(struct iphdr), icmp2 + sizeof(struct iphdr), @@ -217,10 +186,10 @@ TEST_F(RawSocketTest, RawAndPingSockets) { struct icmphdr icmp; icmp.type = ICMP_ECHO; icmp.code = 0; - icmp.un.echo.sequence = - *static_cast<unsigned short *>(&icmp.un.echo.sequence); + icmp.un.echo.sequence = *static_cast<unsigned short*>(&icmp.un.echo.sequence); ASSERT_THAT(RetryEINTR(sendto)(ping_sock.get(), &icmp, sizeof(icmp), 0, - (struct sockaddr *)&addr_, sizeof(addr_)), + reinterpret_cast<struct sockaddr*>(&addr_), + sizeof(addr_)), SyscallSucceedsWithValue(sizeof(icmp))); // Both sockets will receive the echo request and reply in indeterminate @@ -247,12 +216,12 @@ TEST_F(RawSocketTest, RawAndPingSockets) { int types[] = {ICMP_ECHO, ICMP_ECHOREPLY}; for (int type : types) { auto match_type_ping = [=](char buf[kBufSize]) { - struct icmphdr *icmp = reinterpret_cast<struct icmphdr *>(buf); + struct icmphdr* icmp = reinterpret_cast<struct icmphdr*>(buf); return icmp->type == type; }; auto match_type_raw = [=](char buf[kBufSize]) { - struct icmphdr *icmp = - reinterpret_cast<struct icmphdr *>(buf + sizeof(struct iphdr)); + struct icmphdr* icmp = + reinterpret_cast<struct icmphdr*>(buf + sizeof(struct iphdr)); return icmp->type == type; }; @@ -266,39 +235,317 @@ TEST_F(RawSocketTest, RawAndPingSockets) { } } -void RawSocketTest::SendEmptyICMP(struct icmphdr *icmp) { +// Test that shutting down an unconnected socket fails. +TEST_F(RawSocketTest, FailShutdownWithoutConnect) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); + ASSERT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN)); +} + +// Test that writing to a shutdown write socket fails. +TEST_F(RawSocketTest, FailWritingToShutdown) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); + ASSERT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds()); + + char c; + ASSERT_THAT(RetryEINTR(write)(s_, &c, sizeof(c)), + SyscallFailsWithErrno(EPIPE)); +} + +// Test that reading from a shutdown read socket gets nothing. +TEST_F(RawSocketTest, FailReadingFromShutdown) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); + ASSERT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + + char c; + ASSERT_THAT(read(s_, &c, sizeof(c)), SyscallSucceedsWithValue(0)); +} + +// Test that listen() fails. +TEST_F(RawSocketTest, FailListen) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT(listen(s_, 1), SyscallFailsWithErrno(ENOTSUP)); +} + +// Test that accept() fails. +TEST_F(RawSocketTest, FailAccept) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + struct sockaddr saddr; + socklen_t addrlen; + ASSERT_THAT(accept(s_, &saddr, &addrlen), SyscallFailsWithErrno(ENOTSUP)); +} + +// Test that getpeername() returns nothing before connect(). +TEST_F(RawSocketTest, FailGetPeerNameBeforeConnect) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + struct sockaddr saddr; + socklen_t addrlen; + ASSERT_THAT(getpeername(s_, &saddr, &addrlen), + SyscallFailsWithErrno(ENOTCONN)); +} + +// Test that getpeername() returns something after connect(). +TEST_F(RawSocketTest, GetPeerName) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); + struct sockaddr saddr; + socklen_t addrlen; + ASSERT_THAT(getpeername(s_, &saddr, &addrlen), SyscallSucceeds()); + ASSERT_GT(addrlen, 0); +} + +// Test that the socket is writable immediately. +TEST_F(RawSocketTest, PollWritableImmediately) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + struct pollfd pfd = {}; + pfd.fd = s_; + pfd.events = POLLOUT; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 10000), SyscallSucceedsWithValue(1)); +} + +// Test that the socket isn't readable before receiving anything. +TEST_F(RawSocketTest, PollNotReadableInitially) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + // Try to receive data with MSG_DONTWAIT, which returns immediately if there's + // nothing to be read. + char buf[117]; + ASSERT_THAT(RetryEINTR(recv)(s_, buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Test that the socket becomes readable once something is written to it. +TEST_F(RawSocketTest, PollTriggeredOnWrite) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + // Write something so that there's data to be read. + struct icmphdr icmp = {}; + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); + + struct pollfd pfd = {}; + pfd.fd = s_; + pfd.events = POLLIN; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 10000), SyscallSucceedsWithValue(1)); +} + +// Test that we can connect() to a valid IP (loopback). +TEST_F(RawSocketTest, ConnectToLoopback) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); +} + +// Test that connect() sends packets to the right place. +TEST_F(RawSocketTest, SendAndReceiveViaConnect) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); + + // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, + // and ID. None of that should matter for raw sockets - the kernel should + // still give us the packet. + struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 2001; + icmp.un.echo.sequence = 2003; + icmp.un.echo.id = 2004; + ASSERT_THAT(send(s_, &icmp, sizeof(icmp), 0), + SyscallSucceedsWithValue(sizeof(icmp))); + + ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); +} + +// Test that calling send() without connect() fails. +TEST_F(RawSocketTest, SendWithoutConnectFails) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, + // and ID. None of that should matter for raw sockets - the kernel should + // still give us the packet. + struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 2015; + icmp.un.echo.sequence = 2017; + icmp.un.echo.id = 2019; + ASSERT_THAT(send(s_, &icmp, sizeof(icmp), 0), + SyscallFailsWithErrno(ENOTCONN)); +} + +// Bind to localhost. +TEST_F(RawSocketTest, BindToLocalhost) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); +} + +// Bind to a different address. +TEST_F(RawSocketTest, BindToInvalid) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + struct sockaddr_in bind_addr = {}; + bind_addr.sin_family = AF_INET; + bind_addr.sin_addr = {1}; // 1.0.0.0 - An address that we can't bind to. + ASSERT_THAT(bind(s_, reinterpret_cast<struct sockaddr*>(&bind_addr), + sizeof(bind_addr)), + SyscallFailsWithErrno(EADDRNOTAVAIL)); +} + +// Bind to localhost, then send and receive packets. +TEST_F(RawSocketTest, BindSendAndReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); + + // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, + // and ID. None of that should matter for raw sockets - the kernel should + // still give us the packet. + struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 2001; + icmp.un.echo.sequence = 2004; + icmp.un.echo.id = 2007; + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); + + ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); +} + +// Bind and connect to localhost and send/receive packets. +TEST_F(RawSocketTest, BindConnectSendAndReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); + ASSERT_THAT( + connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); + + // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, + // and ID. None of that should matter for raw sockets - the kernel should + // still give us the packet. + struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 2009; + icmp.un.echo.sequence = 2010; + icmp.un.echo.id = 7; + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); + + ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); +} + +void RawSocketTest::ExpectICMPSuccess(const struct icmphdr& icmp) { + // We're going to receive both the echo request and reply, but the order is + // indeterminate. + char recv_buf[512]; + struct sockaddr_in src; + bool received_request = false; + bool received_reply = false; + + for (int i = 0; i < 2; i++) { + // Receive the packet. + ASSERT_NO_FATAL_FAILURE(ReceiveICMP(recv_buf, ABSL_ARRAYSIZE(recv_buf), + sizeof(struct icmphdr), &src)); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(sockaddr_in)), 0); + struct icmphdr* recvd_icmp = + reinterpret_cast<struct icmphdr*>(recv_buf + sizeof(struct iphdr)); + switch (recvd_icmp->type) { + case ICMP_ECHO: + EXPECT_FALSE(received_request); + received_request = true; + // The packet should be identical to what we sent. + EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &icmp, sizeof(icmp)), + 0); + break; + + case ICMP_ECHOREPLY: + EXPECT_FALSE(received_reply); + received_reply = true; + // Most fields should be the same. + EXPECT_EQ(recvd_icmp->code, icmp.code); + EXPECT_EQ(recvd_icmp->un.echo.sequence, icmp.un.echo.sequence); + EXPECT_EQ(recvd_icmp->un.echo.id, icmp.un.echo.id); + // A couple are different. + EXPECT_EQ(recvd_icmp->type, ICMP_ECHOREPLY); + // The checksum is computed in such a way that it is guaranteed to have + // changed. + EXPECT_NE(recvd_icmp->checksum, icmp.checksum); + break; + } + } + + ASSERT_TRUE(received_request); + ASSERT_TRUE(received_reply); +} + +void RawSocketTest::SendEmptyICMP(const struct icmphdr& icmp) { ASSERT_NO_FATAL_FAILURE(SendEmptyICMPTo(s_, &addr_, icmp)); } -void RawSocketTest::SendEmptyICMPTo(int sock, struct sockaddr_in *addr, - struct icmphdr *icmp) { - struct iovec iov = {.iov_base = icmp, .iov_len = sizeof(*icmp)}; - struct msghdr msg { - .msg_name = addr, .msg_namelen = sizeof(*addr), .msg_iov = &iov, - .msg_iovlen = 1, .msg_control = NULL, .msg_controllen = 0, .msg_flags = 0, - }; - ASSERT_THAT(sendmsg(sock, &msg, 0), SyscallSucceedsWithValue(sizeof(*icmp))); +void RawSocketTest::SendEmptyICMPTo(int sock, struct sockaddr_in* addr, + const struct icmphdr& icmp) { + // It's safe to use const_cast here because sendmsg won't modify the iovec. + struct iovec iov = {}; + iov.iov_base = static_cast<void*>(const_cast<struct icmphdr*>(&icmp)); + iov.iov_len = sizeof(icmp); + struct msghdr msg = {}; + msg.msg_name = addr; + msg.msg_namelen = sizeof(*addr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + ASSERT_THAT(sendmsg(sock, &msg, 0), SyscallSucceedsWithValue(sizeof(icmp))); } -void RawSocketTest::ReceiveICMP(char *recv_buf, size_t recv_buf_len, - size_t expected_size, struct sockaddr_in *src) { +void RawSocketTest::ReceiveICMP(char* recv_buf, size_t recv_buf_len, + size_t expected_size, struct sockaddr_in* src) { ASSERT_NO_FATAL_FAILURE( ReceiveICMPFrom(recv_buf, recv_buf_len, expected_size, src, s_)); } -void RawSocketTest::ReceiveICMPFrom(char *recv_buf, size_t recv_buf_len, +void RawSocketTest::ReceiveICMPFrom(char* recv_buf, size_t recv_buf_len, size_t expected_size, - struct sockaddr_in *src, int sock) { - struct iovec iov = {.iov_base = recv_buf, .iov_len = recv_buf_len}; - struct msghdr msg = { - .msg_name = src, - .msg_namelen = sizeof(*src), - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0, - }; + struct sockaddr_in* src, int sock) { + struct iovec iov = {}; + iov.iov_base = recv_buf; + iov.iov_len = recv_buf_len; + struct msghdr msg = {}; + msg.msg_name = src; + msg.msg_namelen = sizeof(*src); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; // We should receive the ICMP packet plus 20 bytes of IP header. ASSERT_THAT(recvmsg(sock, &msg, 0), SyscallSucceedsWithValue(expected_size + sizeof(struct iphdr))); |