summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorBhasker Hariharan <bhaskerh@google.com>2019-06-06 08:05:46 -0700
committerShentubot <shentubot@google.com>2019-06-06 08:07:02 -0700
commit85be01b42d4ac48698d1e8f50a4cf2607a4fc50b (patch)
tree18d58e55f9c21a177ed149d460f751649df7f0e5
parent79f7cb6c1c4c16e3aca44d7fdc8e9f2487a605cf (diff)
Add multi-fd support to fdbased endpoint.
This allows an fdbased endpoint to have multiple underlying fd's from which packets can be read and dispatched/written to. This should allow for higher throughput as well as better scalability of the network stack as number of connections increases. Updates #231 PiperOrigin-RevId: 251852825
-rw-r--r--pkg/tcpip/link/fdbased/endpoint.go168
-rw-r--r--pkg/tcpip/link/fdbased/endpoint_test.go2
-rw-r--r--pkg/tcpip/sample/tun_tcp_connect/main.go2
-rw-r--r--pkg/tcpip/sample/tun_tcp_echo/main.go2
-rw-r--r--pkg/urpc/urpc.go2
-rw-r--r--runsc/boot/config.go6
-rw-r--r--runsc/boot/network.go39
-rw-r--r--runsc/main.go25
-rw-r--r--runsc/sandbox/network.go119
-rw-r--r--runsc/test/testutil/testutil.go1
10 files changed, 249 insertions, 117 deletions
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 1f889c2a0..b88e2e7bf 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -21,12 +21,29 @@
// FD based endpoints can be used in the networking stack by calling New() to
// create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
+//
+// FD based endpoints can use more than one file descriptor to read incoming
+// packets. If there are more than one FDs specified and the underlying FD is an
+// AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the
+// host kernel will consistently hash the packets to the sockets. This ensures
+// that packets for the same TCP streams are not reordered.
+//
+// Similarly if more than one FD's are specified where the underlying FD is not
+// AF_PACKET then it's the caller's responsibility to ensure that all inbound
+// packets on the descriptors are consistently 5 tuple hashed to one of the
+// descriptors to prevent TCP reordering.
+//
+// Since netstack today does not compute 5 tuple hashes for outgoing packets we
+// only use the first FD to write outbound packets. Once 5 tuple hashes for
+// all outbound packets are available we will make use of all underlying FD's to
+// write outbound packets.
package fdbased
import (
"fmt"
"syscall"
+ "golang.org/x/sys/unix"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
@@ -65,8 +82,10 @@ const (
)
type endpoint struct {
- // fd is the file descriptor used to send and receive packets.
- fd int
+ // fds is the set of file descriptors each identifying one inbound/outbound
+ // channel. The endpoint will dispatch from all inbound channels as well as
+ // hash outbound packets to specific channels based on the packet hash.
+ fds []int
// mtu (maximum transmission unit) is the maximum size of a packet.
mtu uint32
@@ -85,8 +104,8 @@ type endpoint struct {
// its end of the communication pipe.
closed func(*tcpip.Error)
- inboundDispatcher linkDispatcher
- dispatcher stack.NetworkDispatcher
+ inboundDispatchers []linkDispatcher
+ dispatcher stack.NetworkDispatcher
// packetDispatchMode controls the packet dispatcher used by this
// endpoint.
@@ -99,17 +118,47 @@ type endpoint struct {
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
- FD int
- MTU uint32
- EthernetHeader bool
- ClosedFunc func(*tcpip.Error)
- Address tcpip.LinkAddress
- SaveRestore bool
- DisconnectOk bool
- GSOMaxSize uint32
+ // FDs is a set of FDs used to read/write packets.
+ FDs []int
+
+ // MTU is the mtu to use for this endpoint.
+ MTU uint32
+
+ // EthernetHeader if true, indicates that the endpoint should read/write
+ // ethernet frames instead of IP packets.
+ EthernetHeader bool
+
+ // ClosedFunc is a function to be called when an endpoint's peer (if
+ // any) closes its end of the communication pipe.
+ ClosedFunc func(*tcpip.Error)
+
+ // Address is the link address for this endpoint. Only used if
+ // EthernetHeader is true.
+ Address tcpip.LinkAddress
+
+ // SaveRestore if true, indicates that this NIC capability set should
+ // include CapabilitySaveRestore
+ SaveRestore bool
+
+ // DisconnectOk if true, indicates that this NIC capability set should
+ // include CapabilityDisconnectOk.
+ DisconnectOk bool
+
+ // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
+ // disabled.
+ GSOMaxSize uint32
+
+ // PacketDispatchMode specifies the type of inbound dispatcher to be
+ // used for this endpoint.
PacketDispatchMode PacketDispatchMode
- TXChecksumOffload bool
- RXChecksumOffload bool
+
+ // TXChecksumOffload if true, indicates that this endpoints capability
+ // set should include CapabilityTXChecksumOffload.
+ TXChecksumOffload bool
+
+ // RXChecksumOffload if true, indicates that this endpoints capability
+ // set should include CapabilityRXChecksumOffload.
+ RXChecksumOffload bool
}
// New creates a new fd-based endpoint.
@@ -117,10 +166,6 @@ type Options struct {
// Makes fd non-blocking, but does not take ownership of fd, which must remain
// open for the lifetime of the returned endpoint.
func New(opts *Options) (tcpip.LinkEndpointID, error) {
- if err := syscall.SetNonblock(opts.FD, true); err != nil {
- return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", opts.FD, err)
- }
-
caps := stack.LinkEndpointCapabilities(0)
if opts.RXChecksumOffload {
caps |= stack.CapabilityRXChecksumOffload
@@ -144,8 +189,12 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) {
caps |= stack.CapabilityDisconnectOk
}
+ if len(opts.FDs) == 0 {
+ return 0, fmt.Errorf("opts.FD is empty, at least one FD must be specified")
+ }
+
e := &endpoint{
- fd: opts.FD,
+ fds: opts.FDs,
mtu: opts.MTU,
caps: caps,
closed: opts.ClosedFunc,
@@ -154,46 +203,71 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) {
packetDispatchMode: opts.PacketDispatchMode,
}
- isSocket, err := isSocketFD(e.fd)
- if err != nil {
- return 0, err
- }
- if isSocket {
- if opts.GSOMaxSize != 0 {
- e.caps |= stack.CapabilityGSO
- e.gsoMaxSize = opts.GSOMaxSize
+ // Create per channel dispatchers.
+ for i := 0; i < len(e.fds); i++ {
+ fd := e.fds[i]
+ if err := syscall.SetNonblock(fd, true); err != nil {
+ return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err)
}
- }
- e.inboundDispatcher, err = createInboundDispatcher(e, isSocket)
- if err != nil {
- return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err)
+
+ isSocket, err := isSocketFD(fd)
+ if err != nil {
+ return 0, err
+ }
+ if isSocket {
+ if opts.GSOMaxSize != 0 {
+ e.caps |= stack.CapabilityGSO
+ e.gsoMaxSize = opts.GSOMaxSize
+ }
+ }
+ inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket)
+ if err != nil {
+ return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err)
+ }
+ e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher)
}
return stack.RegisterLinkEndpoint(e), nil
}
-func createInboundDispatcher(e *endpoint, isSocket bool) (linkDispatcher, error) {
+func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) {
// By default use the readv() dispatcher as it works with all kinds of
// FDs (tap/tun/unix domain sockets and af_packet).
- inboundDispatcher, err := newReadVDispatcher(e.fd, e)
+ inboundDispatcher, err := newReadVDispatcher(fd, e)
if err != nil {
- return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", e.fd, e, err)
+ return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err)
}
if isSocket {
+ sa, err := unix.Getsockname(fd)
+ if err != nil {
+ return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
+ }
+ switch sa.(type) {
+ case *unix.SockaddrLinklayer:
+ // enable PACKET_FANOUT mode is the underlying socket is
+ // of type AF_PACKET.
+ const fanoutID = 1
+ const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG
+ fanoutArg := fanoutID | fanoutType<<16
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
+ return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
+ }
+ }
+
switch e.packetDispatchMode {
case PacketMMap:
- inboundDispatcher, err = newPacketMMapDispatcher(e.fd, e)
+ inboundDispatcher, err = newPacketMMapDispatcher(fd, e)
if err != nil {
- return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", e.fd, e, err)
+ return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err)
}
case RecvMMsg:
// If the provided FD is a socket then we optimize
// packet reads by using recvmmsg() instead of read() to
// read packets in a batch.
- inboundDispatcher, err = newRecvMMsgDispatcher(e.fd, e)
+ inboundDispatcher, err = newRecvMMsgDispatcher(fd, e)
if err != nil {
- return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", e.fd, e, err)
+ return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err)
}
}
}
@@ -215,7 +289,9 @@ func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
// Link endpoints are not savable. When transportation endpoints are
// saved, they stop sending outgoing packets and all incoming packets
// are rejected.
- go e.dispatchLoop() // S/R-SAFE: See above.
+ for i := range e.inboundDispatchers {
+ go e.dispatchLoop(e.inboundDispatchers[i]) // S/R-SAFE: See above.
+ }
}
// IsAttached implements stack.LinkEndpoint.IsAttached.
@@ -305,26 +381,26 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
}
}
- return rawfile.NonBlockingWrite3(e.fd, vnetHdrBuf, hdr.View(), payload.ToView())
+ return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, hdr.View(), payload.ToView())
}
if payload.Size() == 0 {
- return rawfile.NonBlockingWrite(e.fd, hdr.View())
+ return rawfile.NonBlockingWrite(e.fds[0], hdr.View())
}
- return rawfile.NonBlockingWrite3(e.fd, hdr.View(), payload.ToView(), nil)
+ return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil)
}
// WriteRawPacket writes a raw packet directly to the file descriptor.
func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error {
- return rawfile.NonBlockingWrite(e.fd, packet)
+ return rawfile.NonBlockingWrite(e.fds[0], packet)
}
// dispatchLoop reads packets from the file descriptor in a loop and dispatches
// them to the network stack.
-func (e *endpoint) dispatchLoop() *tcpip.Error {
+func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error {
for {
- cont, err := e.inboundDispatcher.dispatch()
+ cont, err := inboundDispatcher.dispatch()
if err != nil || !cont {
if e.closed != nil {
e.closed(err)
@@ -363,7 +439,7 @@ func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabiliti
syscall.SetNonblock(fd, true)
e := &InjectableEndpoint{endpoint: endpoint{
- fd: fd,
+ fds: []int{fd},
mtu: mtu,
caps: capabilities,
}}
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index fd1722074..ba3e09192 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -67,7 +67,7 @@ func newContext(t *testing.T, opt *Options) *context {
done <- struct{}{}
}
- opt.FD = fds[1]
+ opt.FDs = []int{fds[1]}
epID, err := New(opt)
if err != nil {
t.Fatalf("Failed to create FD endpoint: %v", err)
diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go
index 1681de56e..1fa899e7e 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/main.go
+++ b/pkg/tcpip/sample/tun_tcp_connect/main.go
@@ -137,7 +137,7 @@ func main() {
log.Fatal(err)
}
- linkID, err := fdbased.New(&fdbased.Options{FD: fd, MTU: mtu})
+ linkID, err := fdbased.New(&fdbased.Options{FDs: []int{fd}, MTU: mtu})
if err != nil {
log.Fatal(err)
}
diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go
index 642607f83..d47085581 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/main.go
+++ b/pkg/tcpip/sample/tun_tcp_echo/main.go
@@ -129,7 +129,7 @@ func main() {
}
linkID, err := fdbased.New(&fdbased.Options{
- FD: fd,
+ FDs: []int{fd},
MTU: mtu,
EthernetHeader: *tap,
Address: tcpip.LinkAddress(maddr),
diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go
index 0f155ec74..4ea684659 100644
--- a/pkg/urpc/urpc.go
+++ b/pkg/urpc/urpc.go
@@ -35,7 +35,7 @@ import (
)
// maxFiles determines the maximum file payload.
-const maxFiles = 16
+const maxFiles = 32
// ErrTooManyFiles is returned when too many file descriptors are mapped.
var ErrTooManyFiles = errors.New("too many files")
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 15f624f9b..8564c502d 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -221,6 +221,11 @@ type Config struct {
// user, and without chrooting the sandbox process. This can be
// necessary in test environments that have limited capabilities.
TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+ // NumNetworkChannels controls the number of AF_PACKET sockets that map
+ // to the same underlying network device. This allows netstack to better
+ // scale for high throughput use cases.
+ NumNetworkChannels int
}
// ToFlags returns a slice of flags that correspond to the given Config.
@@ -244,6 +249,7 @@ func (c *Config) ToFlags() []string {
"--panic-signal=" + strconv.Itoa(c.PanicSignal),
"--profile=" + strconv.FormatBool(c.ProfileEnable),
"--net-raw=" + strconv.FormatBool(c.EnableRaw),
+ "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
}
if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
// Only include if set since it is never to be used by users.
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 0a154d90b..82c259f47 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -57,6 +57,10 @@ type FDBasedLink struct {
Routes []Route
GSOMaxSize uint32
LinkAddress []byte
+
+ // NumChannels controls how many underlying FD's are to be used to
+ // create this endpoint.
+ NumChannels int
}
// LoopbackLink configures a loopback li nk.
@@ -68,8 +72,9 @@ type LoopbackLink struct {
// CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes.
type CreateLinksAndRoutesArgs struct {
- // FilePayload contains the fds associated with the FDBasedLinks. The
- // two slices must have the same length.
+ // FilePayload contains the fds associated with the FDBasedLinks. The
+ // number of fd's should match the sum of the NumChannels field of the
+ // FDBasedLink entries below.
urpc.FilePayload
LoopbackLinks []LoopbackLink
@@ -95,8 +100,12 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
// CreateLinksAndRoutes creates links and routes in a network stack. It should
// only be called once.
func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
- if len(args.FilePayload.Files) != len(args.FDBasedLinks) {
- return fmt.Errorf("FilePayload must be same length at FDBasedLinks")
+ wantFDs := 0
+ for _, l := range args.FDBasedLinks {
+ wantFDs += l.NumChannels
+ }
+ if got := len(args.FilePayload.Files); got != wantFDs {
+ return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs)
}
var nicID tcpip.NICID
@@ -123,20 +132,26 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
}
- for i, link := range args.FDBasedLinks {
+ fdOffset := 0
+ for _, link := range args.FDBasedLinks {
nicID++
nicids[link.Name] = nicID
- // Copy the underlying FD.
- oldFD := args.FilePayload.Files[i].Fd()
- newFD, err := syscall.Dup(int(oldFD))
- if err != nil {
- return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ FDs := []int{}
+ for j := 0; j < link.NumChannels; j++ {
+ // Copy the underlying FD.
+ oldFD := args.FilePayload.Files[fdOffset].Fd()
+ newFD, err := syscall.Dup(int(oldFD))
+ if err != nil {
+ return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ }
+ FDs = append(FDs, newFD)
+ fdOffset++
}
mac := tcpip.LinkAddress(link.LinkAddress)
linkEP, err := fdbased.New(&fdbased.Options{
- FD: newFD,
+ FDs: FDs,
MTU: uint32(link.MTU),
EthernetHeader: true,
Address: mac,
@@ -148,7 +163,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
return err
}
- log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)
+ log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
return err
}
diff --git a/runsc/main.go b/runsc/main.go
index 11bc73f75..44ad23cba 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -60,16 +60,16 @@ var (
straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
// Flags that control sandbox runtime behavior.
- platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
- network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
- gso = flag.Bool("gso", true, "enable generic segmenation offload")
- fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
- overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
- watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
- panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
- profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
- netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
-
+ platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+ network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+ gso = flag.Bool("gso", true, "enable generic segmenation offload")
+ fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+ overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+ watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
+ panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+ profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+ netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+ numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
)
@@ -141,6 +141,10 @@ func main() {
cmd.Fatalf("%v", err)
}
+ if *numNetworkChannels <= 0 {
+ cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
+ }
+
// Create a new Config from the flags.
conf := &boot.Config{
RootDir: *rootDir,
@@ -162,6 +166,7 @@ func main() {
ProfileEnable: *profile,
EnableRaw: *netRaw,
TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
+ NumNetworkChannels: *numNetworkChannels,
}
if len(*straceSyscalls) != 0 {
conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 0460d5f1a..1fd091514 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -68,7 +68,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
// Build the path to the net namespace of the sandbox process.
// This is what we will copy.
nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
- if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil {
+ if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil {
return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
}
case boot.NetworkHost:
@@ -138,7 +138,7 @@ func isRootNS() (bool, error) {
// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
// net namespace with the given path, creates them in the sandbox, and removes
// them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error {
// Join the network namespace that we will be copying.
restore, err := joinNetNS(nsPath)
if err != nil {
@@ -202,25 +202,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
continue
}
- // Create the socket.
- const protocol = 0x0300 // htons(ETH_P_ALL)
- fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
- if err != nil {
- return fmt.Errorf("unable to create raw socket: %v", err)
- }
- deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
-
- // Bind to the appropriate device.
- ll := syscall.SockaddrLinklayer{
- Protocol: protocol,
- Ifindex: iface.Index,
- Hatype: 0, // No ARP type.
- Pkttype: syscall.PACKET_OTHERHOST,
- }
- if err := syscall.Bind(fd, &ll); err != nil {
- return fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
- }
-
// Scrape the routes before removing the address, since that
// will remove the routes as well.
routes, def, err := routesForIface(iface)
@@ -236,9 +217,10 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
link := boot.FDBasedLink{
- Name: iface.Name,
- MTU: iface.MTU,
- Routes: routes,
+ Name: iface.Name,
+ MTU: iface.MTU,
+ Routes: routes,
+ NumChannels: numNetworkChannels,
}
// Get the link for the interface.
@@ -248,30 +230,23 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr)
- if enableGSO {
- gso, err := isGSOEnabled(fd, iface.Name)
+ log.Debugf("Setting up network channels")
+ // Create the socket for the device.
+ for i := 0; i < link.NumChannels; i++ {
+ log.Debugf("Creating Channel %d", i)
+ socketEntry, err := createSocket(iface, ifaceLink, enableGSO)
if err != nil {
- return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+ return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
}
- if gso {
- if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
- return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
- }
- link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize
+ if i == 0 {
+ link.GSOMaxSize = socketEntry.gsoMaxSize
} else {
- log.Infof("GSO not available in host.")
+ if link.GSOMaxSize != socketEntry.gsoMaxSize {
+ return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
+ link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
+ }
}
- }
-
- // Use SO_RCVBUFFORCE because on linux the receive buffer for an
- // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
- // defaults to a unusually low value of 208KB. This is too low
- // for gVisor to be able to receive packets at high throughputs
- // without incurring packet drops.
- const rcvBufSize = 4 << 20 // 4MB.
-
- if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
- return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+ args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
}
// Collect the addresses for the interface, enable forwarding,
@@ -285,7 +260,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
}
- args.FilePayload.Files = append(args.FilePayload.Files, deviceFile)
args.FDBasedLinks = append(args.FDBasedLinks, link)
}
@@ -296,6 +270,61 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
return nil
}
+type socketEntry struct {
+ deviceFile *os.File
+ gsoMaxSize uint32
+}
+
+// createSocket creates an underlying AF_PACKET socket and configures it for use by
+// the sentry and returns an *os.File that wraps the underlying socket fd.
+func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
+ // Create the socket.
+ const protocol = 0x0300 // htons(ETH_P_ALL)
+ fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+ if err != nil {
+ return nil, fmt.Errorf("unable to create raw socket: %v", err)
+ }
+ deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
+ // Bind to the appropriate device.
+ ll := syscall.SockaddrLinklayer{
+ Protocol: protocol,
+ Ifindex: iface.Index,
+ Hatype: 0, // No ARP type.
+ Pkttype: syscall.PACKET_OTHERHOST,
+ }
+ if err := syscall.Bind(fd, &ll); err != nil {
+ return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+ }
+
+ gsoMaxSize := uint32(0)
+ if enableGSO {
+ gso, err := isGSOEnabled(fd, iface.Name)
+ if err != nil {
+ return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+ }
+ if gso {
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+ return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+ }
+ gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
+ } else {
+ log.Infof("GSO not available in host.")
+ }
+ }
+
+ // Use SO_RCVBUFFORCE because on linux the receive buffer for an
+ // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
+ // defaults to a unusually low value of 208KB. This is too low
+ // for gVisor to be able to receive packets at high throughputs
+ // without incurring packet drops.
+ const rcvBufSize = 4 << 20 // 4MB.
+
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
+ return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+ }
+ return &socketEntry{deviceFile, gsoMaxSize}, nil
+}
+
// loopbackLinks collects the links for a loopback interface.
func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
var links []boot.LoopbackLink
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index 9efb1ba8e..727b648a6 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -136,6 +136,7 @@ func TestConfig() *boot.Config {
Strace: true,
FileAccess: boot.FileAccessExclusive,
TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
+ NumNetworkChannels: 1,
}
}