diff options
Diffstat (limited to 'runsc/sandbox/network.go')
-rw-r--r-- | runsc/sandbox/network.go | 411 |
1 files changed, 411 insertions, 0 deletions
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go new file mode 100644 index 000000000..817a923ad --- /dev/null +++ b/runsc/sandbox/network.go @@ -0,0 +1,411 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "net" + "os" + "path/filepath" + "runtime" + "strconv" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/vishvananda/netlink" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/specutils" +) + +// setupNetwork configures the network stack to mimic the local network +// configuration. Docker uses network namespaces with vnets to configure the +// network for the container. The untrusted app expects to see the same network +// inside the sandbox. Routing and port mapping is handled directly by docker +// with most of network information not even available to the runtime. +// +// Netstack inside the sandbox speaks directly to the device using a raw socket. +// All IP addresses assigned to the NIC, are removed and passed on to netstack's +// device. +// +// If 'conf.Network' is NoNetwork, skips local configuration and creates a +// loopback interface only. +// +// Run the following container to test it: +// docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 +func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error { + log.Infof("Setting up network") + + switch conf.Network { + case boot.NetworkNone: + log.Infof("Network is disabled, create loopback interface only") + if err := createDefaultLoopbackInterface(conn); err != nil { + return fmt.Errorf("creating default loopback interface: %v", err) + } + case boot.NetworkSandbox: + // Build the path to the net namespace of the sandbox process. + // This is what we will copy. + nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil { + return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) + } + case boot.NetworkHost: + // Nothing to do here. + default: + return fmt.Errorf("invalid network type: %d", conf.Network) + } + return nil +} + +func createDefaultLoopbackInterface(conn *urpc.Client) error { + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ + LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink}, + }, nil); err != nil { + return fmt.Errorf("creating loopback link and routes: %v", err) + } + return nil +} + +func joinNetNS(nsPath string) (func(), error) { + runtime.LockOSThread() + restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + Path: nsPath, + }) + if err != nil { + runtime.UnlockOSThread() + return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err) + } + return func() { + restoreNS() + runtime.UnlockOSThread() + }, nil +} + +// isRootNS determines whether we are running in the root net namespace. +// /proc/sys/net/core/rmem_default only exists in root network namespace. +func isRootNS() (bool, error) { + err := syscall.Access("/proc/sys/net/core/rmem_default", syscall.F_OK) + switch err { + case nil: + return true, nil + case syscall.ENOENT: + return false, nil + default: + return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err) + } +} + +// createInterfacesAndRoutesFromNS scrapes the interface and routes from the +// net namespace with the given path, creates them in the sandbox, and removes +// them from the host. +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error { + // Join the network namespace that we will be copying. + restore, err := joinNetNS(nsPath) + if err != nil { + return err + } + defer restore() + + // Get all interfaces in the namespace. + ifaces, err := net.Interfaces() + if err != nil { + return fmt.Errorf("querying interfaces: %v", err) + } + + isRoot, err := isRootNS() + if err != nil { + return err + } + if isRoot { + return fmt.Errorf("cannot run with network enabled in root network namespace") + } + + // Collect addresses and routes from the interfaces. + var args boot.CreateLinksAndRoutesArgs + for _, iface := range ifaces { + if iface.Flags&net.FlagUp == 0 { + log.Infof("Skipping down interface: %+v", iface) + continue + } + + allAddrs, err := iface.Addrs() + if err != nil { + return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err) + } + + // We build our own loopback device. + if iface.Flags&net.FlagLoopback != 0 { + link, err := loopbackLink(iface, allAddrs) + if err != nil { + return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err) + } + args.LoopbackLinks = append(args.LoopbackLinks, link) + continue + } + + var ipAddrs []*net.IPNet + for _, ifaddr := range allAddrs { + ipNet, ok := ifaddr.(*net.IPNet) + if !ok { + return fmt.Errorf("address is not IPNet: %+v", ifaddr) + } + ipAddrs = append(ipAddrs, ipNet) + } + if len(ipAddrs) == 0 { + log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name) + continue + } + + // Scrape the routes before removing the address, since that + // will remove the routes as well. + routes, defv4, defv6, err := routesForIface(iface) + if err != nil { + return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) + } + if defv4 != nil { + if !args.Defaultv4Gateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) + } + args.Defaultv4Gateway.Route = *defv4 + args.Defaultv4Gateway.Name = iface.Name + } + + if defv6 != nil { + if !args.Defaultv6Gateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) + } + args.Defaultv6Gateway.Route = *defv6 + args.Defaultv6Gateway.Name = iface.Name + } + + link := boot.FDBasedLink{ + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + TXChecksumOffload: txChecksumOffload, + RXChecksumOffload: rxChecksumOffload, + NumChannels: numNetworkChannels, + QDisc: qDisc, + } + + // Get the link for the interface. + ifaceLink, err := netlink.LinkByName(iface.Name) + if err != nil { + return fmt.Errorf("getting link for interface %q: %v", iface.Name, err) + } + link.LinkAddress = ifaceLink.Attrs().HardwareAddr + + log.Debugf("Setting up network channels") + // Create the socket for the device. + for i := 0; i < link.NumChannels; i++ { + log.Debugf("Creating Channel %d", i) + socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO) + if err != nil { + return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err) + } + if i == 0 { + link.GSOMaxSize = socketEntry.gsoMaxSize + } else { + if link.GSOMaxSize != socketEntry.gsoMaxSize { + return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", + link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name) + } + } + args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) + } + + if link.GSOMaxSize == 0 && softwareGSO { + // Hardware GSO is disabled. Let's enable software GSO. + link.GSOMaxSize = stack.SoftwareGSOMaxSize + link.SoftwareGSOEnabled = true + } + + // Collect the addresses for the interface, enable forwarding, + // and remove them from the host. + for _, addr := range ipAddrs { + link.Addresses = append(link.Addresses, addr.IP) + + // Steal IP address from NIC. + if err := removeAddress(ifaceLink, addr.String()); err != nil { + return fmt.Errorf("removing address %v from device %q: %v", iface.Name, addr, err) + } + } + + args.FDBasedLinks = append(args.FDBasedLinks, link) + } + + log.Debugf("Setting up network, config: %+v", args) + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { + return fmt.Errorf("creating links and routes: %v", err) + } + return nil +} + +type socketEntry struct { + deviceFile *os.File + gsoMaxSize uint32 +} + +// createSocket creates an underlying AF_PACKET socket and configures it for use by +// the sentry and returns an *os.File that wraps the underlying socket fd. +func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) { + // Create the socket. + const protocol = 0x0300 // htons(ETH_P_ALL) + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return nil, fmt.Errorf("unable to create raw socket: %v", err) + } + deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: iface.Index, + Hatype: 0, // No ARP type. + Pkttype: syscall.PACKET_OTHERHOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + gsoMaxSize := uint32(0) + if enableGSO { + gso, err := isGSOEnabled(fd, iface.Name) + if err != nil { + return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) + } + if gso { + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { + return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) + } + gsoMaxSize = ifaceLink.Attrs().GSOMaxSize + } else { + log.Infof("GSO not available in host.") + } + } + + // Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer + // for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max". + // wmem_max/rmem_max default to a unusually low value of 208KB. This is too low + // for gVisor to be able to receive packets at high throughputs without + // incurring packet drops. + const bufSize = 4 << 20 // 4MB. + + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil { + return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err) + } + + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil { + return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err) + } + + return &socketEntry{deviceFile, gsoMaxSize}, nil +} + +// loopbackLink returns the link with addresses and routes for a loopback +// interface. +func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) { + link := boot.LoopbackLink{ + Name: iface.Name, + } + for _, addr := range addrs { + ipNet, ok := addr.(*net.IPNet) + if !ok { + return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr) + } + dst := *ipNet + dst.IP = dst.IP.Mask(dst.Mask) + link.Addresses = append(link.Addresses, ipNet.IP) + link.Routes = append(link.Routes, boot.Route{ + Destination: dst, + }) + } + return link, nil +} + +// routesForIface iterates over all routes for the given interface and converts +// them to boot.Routes. It also returns the a default v4/v6 route if found. +func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) { + link, err := netlink.LinkByIndex(iface.Index) + if err != nil { + return nil, nil, nil, err + } + rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) + if err != nil { + return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) + } + + var defv4, defv6 *boot.Route + var routes []boot.Route + for _, r := range rs { + // Is it a default route? + if r.Dst == nil { + if r.Gw == nil { + return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) + } + // Create a catch all route to the gateway. + switch len(r.Gw) { + case header.IPv4AddressSize: + if defv4 != nil { + return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r) + } + defv4 = &boot.Route{ + Destination: net.IPNet{ + IP: net.IPv4zero, + Mask: net.IPMask(net.IPv4zero), + }, + Gateway: r.Gw, + } + case header.IPv6AddressSize: + if defv6 != nil { + return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r) + } + + defv6 = &boot.Route{ + Destination: net.IPNet{ + IP: net.IPv6zero, + Mask: net.IPMask(net.IPv6zero), + }, + Gateway: r.Gw, + } + default: + return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r) + } + continue + } + + dst := *r.Dst + dst.IP = dst.IP.Mask(dst.Mask) + routes = append(routes, boot.Route{ + Destination: dst, + Gateway: r.Gw, + }) + } + return routes, defv4, defv6, nil +} + +// removeAddress removes IP address from network device. It's equivalent to: +// ip addr del <ipAndMask> dev <name> +func removeAddress(source netlink.Link, ipAndMask string) error { + addr, err := netlink.ParseAddr(ipAndMask) + if err != nil { + return err + } + return netlink.AddrDel(source, addr) +} |