summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
authorBhasker Hariharan <bhaskerh@google.com>2019-06-06 08:05:46 -0700
committerShentubot <shentubot@google.com>2019-06-06 08:07:02 -0700
commit85be01b42d4ac48698d1e8f50a4cf2607a4fc50b (patch)
tree18d58e55f9c21a177ed149d460f751649df7f0e5 /runsc
parent79f7cb6c1c4c16e3aca44d7fdc8e9f2487a605cf (diff)
Add multi-fd support to fdbased endpoint.
This allows an fdbased endpoint to have multiple underlying fd's from which packets can be read and dispatched/written to. This should allow for higher throughput as well as better scalability of the network stack as number of connections increases. Updates #231 PiperOrigin-RevId: 251852825
Diffstat (limited to 'runsc')
-rw-r--r--runsc/boot/config.go6
-rw-r--r--runsc/boot/network.go39
-rw-r--r--runsc/main.go25
-rw-r--r--runsc/sandbox/network.go119
-rw-r--r--runsc/test/testutil/testutil.go1
5 files changed, 123 insertions, 67 deletions
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 15f624f9b..8564c502d 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -221,6 +221,11 @@ type Config struct {
// user, and without chrooting the sandbox process. This can be
// necessary in test environments that have limited capabilities.
TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+ // NumNetworkChannels controls the number of AF_PACKET sockets that map
+ // to the same underlying network device. This allows netstack to better
+ // scale for high throughput use cases.
+ NumNetworkChannels int
}
// ToFlags returns a slice of flags that correspond to the given Config.
@@ -244,6 +249,7 @@ func (c *Config) ToFlags() []string {
"--panic-signal=" + strconv.Itoa(c.PanicSignal),
"--profile=" + strconv.FormatBool(c.ProfileEnable),
"--net-raw=" + strconv.FormatBool(c.EnableRaw),
+ "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
}
if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
// Only include if set since it is never to be used by users.
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 0a154d90b..82c259f47 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -57,6 +57,10 @@ type FDBasedLink struct {
Routes []Route
GSOMaxSize uint32
LinkAddress []byte
+
+ // NumChannels controls how many underlying FD's are to be used to
+ // create this endpoint.
+ NumChannels int
}
// LoopbackLink configures a loopback li nk.
@@ -68,8 +72,9 @@ type LoopbackLink struct {
// CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes.
type CreateLinksAndRoutesArgs struct {
- // FilePayload contains the fds associated with the FDBasedLinks. The
- // two slices must have the same length.
+ // FilePayload contains the fds associated with the FDBasedLinks. The
+ // number of fd's should match the sum of the NumChannels field of the
+ // FDBasedLink entries below.
urpc.FilePayload
LoopbackLinks []LoopbackLink
@@ -95,8 +100,12 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
// CreateLinksAndRoutes creates links and routes in a network stack. It should
// only be called once.
func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
- if len(args.FilePayload.Files) != len(args.FDBasedLinks) {
- return fmt.Errorf("FilePayload must be same length at FDBasedLinks")
+ wantFDs := 0
+ for _, l := range args.FDBasedLinks {
+ wantFDs += l.NumChannels
+ }
+ if got := len(args.FilePayload.Files); got != wantFDs {
+ return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs)
}
var nicID tcpip.NICID
@@ -123,20 +132,26 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
}
- for i, link := range args.FDBasedLinks {
+ fdOffset := 0
+ for _, link := range args.FDBasedLinks {
nicID++
nicids[link.Name] = nicID
- // Copy the underlying FD.
- oldFD := args.FilePayload.Files[i].Fd()
- newFD, err := syscall.Dup(int(oldFD))
- if err != nil {
- return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ FDs := []int{}
+ for j := 0; j < link.NumChannels; j++ {
+ // Copy the underlying FD.
+ oldFD := args.FilePayload.Files[fdOffset].Fd()
+ newFD, err := syscall.Dup(int(oldFD))
+ if err != nil {
+ return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ }
+ FDs = append(FDs, newFD)
+ fdOffset++
}
mac := tcpip.LinkAddress(link.LinkAddress)
linkEP, err := fdbased.New(&fdbased.Options{
- FD: newFD,
+ FDs: FDs,
MTU: uint32(link.MTU),
EthernetHeader: true,
Address: mac,
@@ -148,7 +163,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
return err
}
- log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)
+ log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
return err
}
diff --git a/runsc/main.go b/runsc/main.go
index 11bc73f75..44ad23cba 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -60,16 +60,16 @@ var (
straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
// Flags that control sandbox runtime behavior.
- platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
- network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
- gso = flag.Bool("gso", true, "enable generic segmenation offload")
- fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
- overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
- watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
- panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
- profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
- netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
-
+ platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+ network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+ gso = flag.Bool("gso", true, "enable generic segmenation offload")
+ fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+ overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+ watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
+ panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+ profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+ netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+ numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
)
@@ -141,6 +141,10 @@ func main() {
cmd.Fatalf("%v", err)
}
+ if *numNetworkChannels <= 0 {
+ cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
+ }
+
// Create a new Config from the flags.
conf := &boot.Config{
RootDir: *rootDir,
@@ -162,6 +166,7 @@ func main() {
ProfileEnable: *profile,
EnableRaw: *netRaw,
TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
+ NumNetworkChannels: *numNetworkChannels,
}
if len(*straceSyscalls) != 0 {
conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 0460d5f1a..1fd091514 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -68,7 +68,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
// Build the path to the net namespace of the sandbox process.
// This is what we will copy.
nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
- if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil {
+ if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil {
return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
}
case boot.NetworkHost:
@@ -138,7 +138,7 @@ func isRootNS() (bool, error) {
// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
// net namespace with the given path, creates them in the sandbox, and removes
// them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error {
// Join the network namespace that we will be copying.
restore, err := joinNetNS(nsPath)
if err != nil {
@@ -202,25 +202,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
continue
}
- // Create the socket.
- const protocol = 0x0300 // htons(ETH_P_ALL)
- fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
- if err != nil {
- return fmt.Errorf("unable to create raw socket: %v", err)
- }
- deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
-
- // Bind to the appropriate device.
- ll := syscall.SockaddrLinklayer{
- Protocol: protocol,
- Ifindex: iface.Index,
- Hatype: 0, // No ARP type.
- Pkttype: syscall.PACKET_OTHERHOST,
- }
- if err := syscall.Bind(fd, &ll); err != nil {
- return fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
- }
-
// Scrape the routes before removing the address, since that
// will remove the routes as well.
routes, def, err := routesForIface(iface)
@@ -236,9 +217,10 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
link := boot.FDBasedLink{
- Name: iface.Name,
- MTU: iface.MTU,
- Routes: routes,
+ Name: iface.Name,
+ MTU: iface.MTU,
+ Routes: routes,
+ NumChannels: numNetworkChannels,
}
// Get the link for the interface.
@@ -248,30 +230,23 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr)
- if enableGSO {
- gso, err := isGSOEnabled(fd, iface.Name)
+ log.Debugf("Setting up network channels")
+ // Create the socket for the device.
+ for i := 0; i < link.NumChannels; i++ {
+ log.Debugf("Creating Channel %d", i)
+ socketEntry, err := createSocket(iface, ifaceLink, enableGSO)
if err != nil {
- return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+ return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
}
- if gso {
- if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
- return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
- }
- link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize
+ if i == 0 {
+ link.GSOMaxSize = socketEntry.gsoMaxSize
} else {
- log.Infof("GSO not available in host.")
+ if link.GSOMaxSize != socketEntry.gsoMaxSize {
+ return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
+ link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
+ }
}
- }
-
- // Use SO_RCVBUFFORCE because on linux the receive buffer for an
- // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
- // defaults to a unusually low value of 208KB. This is too low
- // for gVisor to be able to receive packets at high throughputs
- // without incurring packet drops.
- const rcvBufSize = 4 << 20 // 4MB.
-
- if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
- return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+ args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
}
// Collect the addresses for the interface, enable forwarding,
@@ -285,7 +260,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
}
- args.FilePayload.Files = append(args.FilePayload.Files, deviceFile)
args.FDBasedLinks = append(args.FDBasedLinks, link)
}
@@ -296,6 +270,61 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
return nil
}
+type socketEntry struct {
+ deviceFile *os.File
+ gsoMaxSize uint32
+}
+
+// createSocket creates an underlying AF_PACKET socket and configures it for use by
+// the sentry and returns an *os.File that wraps the underlying socket fd.
+func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
+ // Create the socket.
+ const protocol = 0x0300 // htons(ETH_P_ALL)
+ fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+ if err != nil {
+ return nil, fmt.Errorf("unable to create raw socket: %v", err)
+ }
+ deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
+ // Bind to the appropriate device.
+ ll := syscall.SockaddrLinklayer{
+ Protocol: protocol,
+ Ifindex: iface.Index,
+ Hatype: 0, // No ARP type.
+ Pkttype: syscall.PACKET_OTHERHOST,
+ }
+ if err := syscall.Bind(fd, &ll); err != nil {
+ return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+ }
+
+ gsoMaxSize := uint32(0)
+ if enableGSO {
+ gso, err := isGSOEnabled(fd, iface.Name)
+ if err != nil {
+ return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+ }
+ if gso {
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+ return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+ }
+ gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
+ } else {
+ log.Infof("GSO not available in host.")
+ }
+ }
+
+ // Use SO_RCVBUFFORCE because on linux the receive buffer for an
+ // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
+ // defaults to a unusually low value of 208KB. This is too low
+ // for gVisor to be able to receive packets at high throughputs
+ // without incurring packet drops.
+ const rcvBufSize = 4 << 20 // 4MB.
+
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
+ return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+ }
+ return &socketEntry{deviceFile, gsoMaxSize}, nil
+}
+
// loopbackLinks collects the links for a loopback interface.
func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
var links []boot.LoopbackLink
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index 9efb1ba8e..727b648a6 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -136,6 +136,7 @@ func TestConfig() *boot.Config {
Strace: true,
FileAccess: boot.FileAccessExclusive,
TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
+ NumNetworkChannels: 1,
}
}