author | Googler <noreply@google.com> | 2018-04-27 10:37:02 -0700
committer | Adin Scannell <ascannell@google.com> | 2018-04-28 01:44:26 -0400
commit | d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296
tree | 54f95eef73aee6bacbfc736fffc631be2605ed53 /runsc/sandbox
parent | f70210e742919f40aa2f0934a22f1c9ba6dada62
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'runsc/sandbox')
-rw-r--r-- | runsc/sandbox/BUILD | 53
-rw-r--r-- | runsc/sandbox/console.go | 60
-rw-r--r-- | runsc/sandbox/hook.go | 111
-rw-r--r-- | runsc/sandbox/namespace.go | 204
-rw-r--r-- | runsc/sandbox/network.go | 348
-rw-r--r-- | runsc/sandbox/sandbox.go | 666
-rw-r--r-- | runsc/sandbox/sandbox_test.go | 649
-rw-r--r-- | runsc/sandbox/status.go | 56
8 files changed, 2147 insertions, 0 deletions
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD new file mode 100644 index 000000000..bdd95903e --- /dev/null +++ b/runsc/sandbox/BUILD @@ -0,0 +1,53 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "sandbox", + srcs = [ + "console.go", + "hook.go", + "namespace.go", + "network.go", + "sandbox.go", + "status.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/sandbox", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/control/client", + "//pkg/control/server", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/urpc", + "//runsc/boot", + "//runsc/specutils", + "@com_github_kr_pty//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_vishvananda_netlink//:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "sandbox_test", + size = "small", + srcs = ["sandbox_test.go"], + pure = "on", + rundir = ".", + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/sentry/kernel/auth", + "//pkg/unet", + "//runsc/boot", + "//runsc/cmd", + "//runsc/sandbox", + "@com_github_google_subcommands//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/sandbox/console.go b/runsc/sandbox/console.go new file mode 100644 index 000000000..3f133e12a --- /dev/null +++ b/runsc/sandbox/console.go @@ -0,0 +1,60 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "net" + "os" + + "github.com/kr/pty" + "golang.org/x/sys/unix" +) + +// setupConsole creates pty master/slave pair, sends the master FD over the +// given socket, and returns the slave. +func setupConsole(socketPath string) (*os.File, error) { + // Create a new pty master and slave. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + return nil, fmt.Errorf("error opening pty: %v", err) + } + defer ptyMaster.Close() + + // Get a connection to the socket path. + conn, err := net.Dial("unix", socketPath) + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err) + } + uc, ok := conn.(*net.UnixConn) + if !ok { + ptySlave.Close() + return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) + } + socket, err := uc.File() + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err) + } + + // Send the master FD over the connection. 
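// Editorial sketch (not part of this commit): the matching receive side of
// this handoff, for reference. A process that owns the console socket can
// recover the pty master with the same golang.org/x/sys/unix calls already
// imported by this file; the helper name receiveConsoleFD is hypothetical.
func receiveConsoleFD(uc *net.UnixConn) (*os.File, error) {
	f, err := uc.File()
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Receive the "pty-master" payload plus one SCM_RIGHTS control message.
	buf := make([]byte, 32)
	oob := make([]byte, unix.CmsgSpace(4)) // room for a single 32-bit FD
	_, oobn, _, _, err := unix.Recvmsg(int(f.Fd()), buf, oob, 0)
	if err != nil {
		return nil, fmt.Errorf("recvmsg: %v", err)
	}

	// Unpack the FD carried by the control message.
	msgs, err := unix.ParseSocketControlMessage(oob[:oobn])
	if err != nil || len(msgs) != 1 {
		return nil, fmt.Errorf("expected 1 control message, got %d: %v", len(msgs), err)
	}
	fds, err := unix.ParseUnixRights(&msgs[0])
	if err != nil || len(fds) != 1 {
		return nil, fmt.Errorf("expected 1 fd, got %d: %v", len(fds), err)
	}
	return os.NewFile(uintptr(fds[0]), "pty-master"), nil
}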
+ msg := unix.UnixRights(int(ptyMaster.Fd())) + if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err) + } + return ptySlave, nil +} diff --git a/runsc/sandbox/hook.go b/runsc/sandbox/hook.go new file mode 100644 index 000000000..40b064cdc --- /dev/null +++ b/runsc/sandbox/hook.go @@ -0,0 +1,111 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "path/filepath" + "strings" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// This file implements hooks as defined in OCI spec: +// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22 +// +// "hooks":{ +// "prestart":[{ +// "path":"/usr/bin/dockerd", +// "args":[ +// "libnetwork-setkey", "arg2", +// ] +// }] +// }, + +// executeHooksBestEffort executes hooks and logs warning in case they fail. +// Runs all hooks, always. +func executeHooksBestEffort(hooks []specs.Hook, s specs.State) { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + log.Warningf("Failure to execute hook %+v, err: %v", h, err) + } + } +} + +// executeHooks executes hooks until the first one fails or they all execute. +func executeHooks(hooks []specs.Hook, s specs.State) error { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + return err + } + } + return nil +} + +func executeHook(h specs.Hook, s specs.State) error { + log.Debugf("Executing hook %+v, state: %+v", h, s) + + if strings.TrimSpace(h.Path) == "" { + return fmt.Errorf("empty path for hook") + } + if !filepath.IsAbs(h.Path) { + return fmt.Errorf("path for hook is not absolute: %q", h.Path) + } + + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + + c := make(chan error, 1) + go func() { + c <- cmd.Wait() + }() + + var timer <-chan time.Time + if h.Timeout != nil { + timer = time.After(time.Duration(*h.Timeout) * time.Second) + } + select { + case err := <-c: + if err != nil { + return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String()) + } + case <-timer: + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String()) + } + + log.Debugf("Execute hook %q success!", h.Path) + return nil +} diff --git a/runsc/sandbox/namespace.go b/runsc/sandbox/namespace.go new file mode 100644 index 000000000..1d3bcfbb5 --- /dev/null +++ b/runsc/sandbox/namespace.go @@ -0,0 +1,204 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// nsCloneFlag returns the clone flag that can be used to set a namespace of +// the given type. +func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { + switch nst { + case specs.IPCNamespace: + return syscall.CLONE_NEWIPC + case specs.MountNamespace: + return syscall.CLONE_NEWNS + case specs.NetworkNamespace: + return syscall.CLONE_NEWNET + case specs.PIDNamespace: + return syscall.CLONE_NEWPID + case specs.UTSNamespace: + return syscall.CLONE_NEWUTS + case specs.UserNamespace: + return syscall.CLONE_NEWUSER + case specs.CgroupNamespace: + panic("cgroup namespace has no associated clone flag") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// nsPath returns the path of the namespace for the current process and the +// given namespace. +func nsPath(nst specs.LinuxNamespaceType) string { + base := "/proc/self/ns" + switch nst { + case specs.CgroupNamespace: + return filepath.Join(base, "cgroup") + case specs.IPCNamespace: + return filepath.Join(base, "ipc") + case specs.MountNamespace: + return filepath.Join(base, "mnt") + case specs.NetworkNamespace: + return filepath.Join(base, "net") + case specs.PIDNamespace: + return filepath.Join(base, "pid") + case specs.UserNamespace: + return filepath.Join(base, "user") + case specs.UTSNamespace: + return filepath.Join(base, "uts") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// getNS returns true and the namespace with the given type from the slice of +// namespaces in the spec. It returns false if the slice does not contain a +// namespace with the type. +func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { + if s.Linux == nil { + return specs.LinuxNamespace{}, false + } + for _, ns := range s.Linux.Namespaces { + if ns.Type == nst { + return ns, true + } + } + return specs.LinuxNamespace{}, false +} + +// filterNS returns a slice of namespaces from the spec with types that match +// those in the `filter` slice. +func filterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { + if s.Linux == nil { + return nil + } + var out []specs.LinuxNamespace + for _, nst := range filter { + if ns, ok := getNS(nst, s); ok { + out = append(out, ns) + } + } + return out +} + +// setNS sets the namespace of the given type. It must be called with +// OSThreadLocked. +func setNS(fd, nsType uintptr) error { + if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { + return err + } + return nil +} + +// applyNS applies the namespace on the current thread and returns a function +// that will restore the namespace to the original value. +// +// Preconditions: Must be called with os thread locked. 
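// Editorial sketch (not part of this commit): a minimal applyNS call site.
// setns(2) affects only the calling thread, so callers pin the goroutine to
// its OS thread for the duration of the switch; joinNetNS in network.go below
// follows exactly this pattern. The path here is hypothetical.
//
//	runtime.LockOSThread()
//	defer runtime.UnlockOSThread()
//
//	restore, err := applyNS(specs.LinuxNamespace{
//		Type: specs.NetworkNamespace,
//		Path: "/proc/1234/ns/net", // some other process's net namespace
//	})
//	if err != nil {
//		return err
//	}
//	defer restore() // switch back before the thread is unlocked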
+func applyNS(ns specs.LinuxNamespace) (func(), error) { + log.Infof("applying namespace %v at path %q", ns.Type, ns.Path) + newNS, err := os.Open(ns.Path) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) + } + defer newNS.Close() + + // Store current netns to restore back after child is started. + curPath := nsPath(ns.Type) + oldNS, err := os.Open(curPath) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", curPath, err) + } + + // Set netns to the one requested and setup function to restore it back. + flag := nsCloneFlag(ns.Type) + if err := setNS(newNS.Fd(), flag); err != nil { + oldNS.Close() + return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) + } + return func() { + log.Infof("restoring namespace %v", ns.Type) + defer oldNS.Close() + if err := setNS(oldNS.Fd(), flag); err != nil { + panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) + } + }, nil +} + +// startInNS joins or creates the given namespaces and calls cmd.Start before +// restoring the namespaces to the original values. +func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { + // We are about to setup namespaces, which requires the os thread being + // locked so that Go doesn't change the thread out from under us. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + + for _, ns := range nss { + if ns.Path == "" { + // No path. Just set a flag to create a new namespace. + cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type) + continue + } + // Join the given namespace, and restore the current namespace + // before exiting. + restoreNS, err := applyNS(ns) + if err != nil { + return err + } + defer restoreNS() + } + + return cmd.Start() +} + +// setUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. +func setUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { + if s.Linux == nil { + return + } + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + for _, idMap := range s.Linux.UIDMappings { + log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } + for _, idMap := range s.Linux.GIDMappings { + log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } +} diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go new file mode 100644 index 000000000..1b6a1d9a6 --- /dev/null +++ b/runsc/sandbox/network.go @@ -0,0 +1,348 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "net" + "os" + "path/filepath" + "runtime" + "strconv" + "strings" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/vishvananda/netlink" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/boot" +) + +// setupNetwork configures the network stack to mimic the local network +// configuration. Docker uses network namespaces with vnets to configure the +// network for the container. The untrusted app expects to see the same network +// inside the sandbox. Routing and port mapping is handled directly by docker +// with most of network information not even available to the runtime. +// +// Netstack inside the sandbox speaks directly to the device using a raw socket. +// All IP addresses assigned to the NIC, are removed and passed on to netstack's +// device. +// +// If 'conf.Network' is NoNetwork, skips local configuration and creates a +// loopback interface only. +// +// Run the following container to test it: +// docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 +func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error { + log.Infof("Setting up network") + + // HACK! + // + // When kubernetes starts a pod, it first creates a sandbox with an + // application that just pauses forever. Later, when a container is + // added to the pod, kubernetes will create another sandbox with a + // config that corresponds to the containerized application, and add it + // to the same namespaces as the pause sandbox. + // + // Running a second sandbox currently breaks because the two sandboxes + // have the same network namespace and configuration, and try to create + // a tap device on the same host device which fails. + // + // Runsc will eventually need to detect that this container is meant to + // be run in the same sandbox as the pausing application, and somehow + // make that happen. + // + // For now the following HACK disables networking for the "pause" + // sandbox, allowing the second sandbox to start up successfully. + // + // Cri-o helpfully adds the "ContainerType" annotation that we can use + // to detect whether we are a pod or container. Cri-containerd will + // support this eventually, but does not currently + // (https://github.com/kubernetes-incubator/cri-containerd/issues/512). + // + // Thus, to support cri-containerd, we check if the exec args is + // "/pause", which is pretty gross. + // + // TODO: Remove this once multiple containers per sandbox + // is properly supported. + if spec.Annotations["io.kubernetes.cri-o.ContainerType"] == "sandbox" || spec.Process.Args[0] == "/pause" { + log.Warningf("HACK: Disabling network") + conf.Network = boot.NetworkNone + } + + switch conf.Network { + case boot.NetworkNone: + log.Infof("Network is disabled, create loopback interface only") + if err := createDefaultLoopbackInterface(conn); err != nil { + return fmt.Errorf("error creating default loopback interface: %v", err) + } + case boot.NetworkSandbox: + // Build the path to the net namespace of the sandbox process. + // This is what we will copy. 
+ nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") + if err := createInterfacesAndRoutesFromNS(conn, nsPath); err != nil { + return fmt.Errorf("error creating interfaces from net namespace %q: %v", nsPath, err) + } + case boot.NetworkHost: + // Nothing to do here. + default: + return fmt.Errorf("Invalid network type: %d", conf.Network) + } + return nil +} + +func createDefaultLoopbackInterface(conn *urpc.Client) error { + link := boot.LoopbackLink{ + Name: "lo", + Addresses: []net.IP{ + net.IP("\x7f\x00\x00\x01"), + net.IPv6loopback, + }, + Routes: []boot.Route{ + { + Destination: net.IP("\x7f\x00\x00\x00"), + Mask: net.IPMask("\xff\x00\x00\x00"), + }, + { + Destination: net.IPv6loopback, + Mask: net.IPMask(strings.Repeat("\xff", 16)), + }, + }, + } + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ + LoopbackLinks: []boot.LoopbackLink{link}, + }, nil); err != nil { + return fmt.Errorf("error creating loopback link and routes: %v", err) + } + return nil +} + +func joinNetNS(nsPath string) (func(), error) { + runtime.LockOSThread() + restoreNS, err := applyNS(specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + Path: nsPath, + }) + if err != nil { + runtime.UnlockOSThread() + return nil, fmt.Errorf("error joining net namespace %q: %v", nsPath, err) + } + return func() { + restoreNS() + runtime.UnlockOSThread() + }, nil +} + +// isRootNS determines whether we are running in the root net namespace. +// +// TODO: Find a better way to detect root network. +func isRootNS(ifaces []net.Interface) bool { + for _, iface := range ifaces { + if iface.Name == "docker0" { + return true + } + } + return false + +} + +// createInterfacesAndRoutesFromNS scrapes the interface and routes from the +// net namespace with the given path, creates them in the sandbox, and removes +// them from the host. +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { + // Join the network namespace that we will be copying. + restore, err := joinNetNS(nsPath) + if err != nil { + return err + } + defer restore() + + // Get all interfaces in the namespace. + ifaces, err := net.Interfaces() + if err != nil { + return fmt.Errorf("error querying interfaces: %v", err) + } + + if isRootNS(ifaces) { + return fmt.Errorf("cannot run in with network enabled in root network namespace") + } + + // Collect addresses and routes from the interfaces. + var args boot.CreateLinksAndRoutesArgs + for _, iface := range ifaces { + if iface.Flags&net.FlagUp == 0 { + log.Infof("Skipping down interface: %+v", iface) + continue + } + + ifaddrs, err := iface.Addrs() + if err != nil { + return fmt.Errorf("error fetching interface addresses for %q: %v", iface.Name, err) + } + + // We build our own loopback devices. + if iface.Flags&net.FlagLoopback != 0 { + links, err := loopbackLinks(iface, ifaddrs) + if err != nil { + return fmt.Errorf("error getting loopback routes and links for iface %q: %v", iface.Name, err) + } + args.LoopbackLinks = append(args.LoopbackLinks, links...) + continue + } + + // Get the link for the interface. + ifaceLink, err := netlink.LinkByName(iface.Name) + if err != nil { + return fmt.Errorf("error getting link for interface %q: %v", iface.Name, err) + } + + // Create the socket. 
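// Editorial note (not part of this commit): ETH_P_ALL is 0x0003, and
// AF_PACKET sockets take the protocol in network byte order. On a
// little-endian host htons is a plain 16-bit byte swap, which is how the
// 0x0300 constant just below is derived:
//
//	func htons(v uint16) uint16 { return v<<8 | v>>8 }
//
//	htons(0x0003) == 0x0300 // == htons(syscall.ETH_P_ALL)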
+ const protocol = 0x0300 // htons(ETH_P_ALL) + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return fmt.Errorf("unable to create raw socket: %v", err) + } + deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") + + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: ifaceLink.Attrs().Index, + Hatype: 0, // No ARP type. + Pkttype: syscall.PACKET_OTHERHOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + // Scrape the routes before removing the address, since that + // will remove the routes as well. + routes, def, err := routesForIface(iface) + if err != nil { + return fmt.Errorf("error getting routes for interface %q: %v", iface.Name, err) + } + if def != nil { + if !args.DefaultGateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway) + } + args.DefaultGateway.Route = *def + args.DefaultGateway.Name = iface.Name + } + + link := boot.FDBasedLink{ + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + } + + // Collect the addresses for the interface, enable forwarding, + // and remove them from the host. + for _, ifaddr := range ifaddrs { + ipNet, ok := ifaddr.(*net.IPNet) + if !ok { + return fmt.Errorf("address is not IPNet: %t %+v", ifaddr, ifaddr) + } + link.Addresses = append(link.Addresses, ipNet.IP) + + // Steal IP address from NIC. + if err := removeAddress(ifaceLink, ipNet.String()); err != nil { + return fmt.Errorf("error removing address %v from device %q: %v", iface.Name, ipNet, err) + } + } + + args.FilePayload.Files = append(args.FilePayload.Files, deviceFile) + args.FDBasedLinks = append(args.FDBasedLinks, link) + } + + log.Debugf("Setting up network, config: %+v", args) + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { + return fmt.Errorf("error creating links and routes: %v", err) + } + return nil +} + +// loopbackLinks collects the links for a loopback interface. +func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) { + var links []boot.LoopbackLink + for _, addr := range addrs { + ipNet, ok := addr.(*net.IPNet) + if !ok { + return nil, fmt.Errorf("address is not IPNet: %t %+v", addr, addr) + } + links = append(links, boot.LoopbackLink{ + Name: iface.Name, + Addresses: []net.IP{ipNet.IP}, + Routes: []boot.Route{{ + Destination: ipNet.IP.Mask(ipNet.Mask), + Mask: ipNet.Mask, + }}, + }) + } + return links, nil +} + +// routesForIface iterates over all routes for the given interface and converts +// them to boot.Routes. +func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { + link, err := netlink.LinkByIndex(iface.Index) + if err != nil { + return nil, nil, err + } + rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) + if err != nil { + return nil, nil, fmt.Errorf("error getting routes from %q: %v", iface.Name, err) + } + + var def *boot.Route + var routes []boot.Route + for _, r := range rs { + // Is it a default route? 
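// Editorial note (not part of this commit): rtnetlink reports the default
// route with no destination attribute, so netlink.RouteList yields it with
// Dst == nil and only the gateway set; that is what the nil check just below
// keys off. In `ip route` terms (addresses hypothetical):
//
//	default via 172.17.0.1 dev eth0     <- r.Dst == nil, r.Gw set
//	172.17.0.0/16 dev eth0 scope link   <- r.Dst == 172.17.0.0/16, r.Gw == nil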
+ if r.Dst == nil { + if r.Gw == nil { + return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) + } + if def != nil { + return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r) + } + emptyAddr := net.IPv6zero + if r.Gw.To4() != nil { + emptyAddr = net.IPv4zero + } + // Create a catch all route to the gateway. + def = &boot.Route{ + Destination: emptyAddr, + Mask: net.IPMask(emptyAddr), + Gateway: r.Gw, + } + continue + } + routes = append(routes, boot.Route{ + Destination: r.Dst.IP.Mask(r.Dst.Mask), + Mask: r.Dst.Mask, + }) + } + return routes, def, nil +} + +// removeAddress removes IP address from network device. It's equivalent to: +// ip addr del <ipAndMask> dev <name> +func removeAddress(source netlink.Link, ipAndMask string) error { + addr, err := netlink.ParseAddr(ipAndMask) + if err != nil { + return err + } + return netlink.AddrDel(source, addr) +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go new file mode 100644 index 000000000..b2fa1d58e --- /dev/null +++ b/runsc/sandbox/sandbox.go @@ -0,0 +1,666 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sandbox creates and manipulates sandboxes. +package sandbox + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "regexp" + "strconv" + "syscall" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/control/client" + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// metadataFilename is the name of the metadata file relative to sandboxRoot +// that holds sandbox metadata. +const metadataFilename = "meta.json" + +// See libcontainer/factory_linux.go +var idRegex = regexp.MustCompile(`^[\w+-\.]+$`) + +// validateID validates the sandbox id. +func validateID(id string) error { + if !idRegex.MatchString(id) { + return fmt.Errorf("invalid sandbox id: %v", id) + } + return nil +} + +// Sandbox wraps a child sandbox process, and is responsible for saving and +// loading sandbox metadata to disk. +// +// Within a root directory, we maintain subdirectories for each sandbox named +// with the sandbox id. The sandbox metadata is is stored as json within the +// sandbox directoy in a file named "meta.json". This metadata format is +// defined by us, and is not part of the OCI spec. +// +// Sandboxes must write this metadata file after any change to their internal +// state. The entire sandbox directory is deleted when the sandbox is +// destroyed. +// +// TODO: Protect against concurrent changes to the sandbox metadata +// file. +type Sandbox struct { + // ID is the sandbox ID. 
+ ID string `json:"id"` + + // Spec is the OCI runtime spec that configures this sandbox. + Spec *specs.Spec `json:"spec"` + + // BundleDir is the directory containing the sandbox bundle. + BundleDir string `json:"bundleDir"` + + // SandboxRoot is the directory containing the sandbox metadata file. + SandboxRoot string `json:"sandboxRoot"` + + // CreatedAt is the time the sandbox was created. + CreatedAt time.Time `json:"createdAt"` + + // Owner is the sandbox owner. + Owner string `json:"owner"` + + // ConsoleSocket is the path to a unix domain socket that will receive + // the console FD. It is only used during create, so we don't need to + // store it in the metadata. + ConsoleSocket string `json:"-"` + + // Pid is the pid of the running sandbox. Only valid if Status is + // Created or Running. + Pid int `json:"pid"` + + // GoferPid is the pid of the gofer running along side the sandbox. May be 0 + // if the gofer has been killed or it's not being used. + GoferPid int `json:"goferPid"` + + // Status is the current sandbox Status. + Status Status `json:"status"` +} + +// Create creates the sandbox subprocess and writes the metadata file. Args +// are additional arguments that will be passed to the sandbox process. +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (*Sandbox, error) { + log.Debugf("Create sandbox %q in root dir: %s", id, conf.RootDir) + if err := validateID(id); err != nil { + return nil, err + } + + sandboxRoot := filepath.Join(conf.RootDir, id) + if exists(sandboxRoot) { + return nil, fmt.Errorf("sandbox with id %q already exists: %q ", id, sandboxRoot) + } + + s := &Sandbox{ + ID: id, + Spec: spec, + ConsoleSocket: consoleSocket, + BundleDir: bundleDir, + SandboxRoot: sandboxRoot, + Status: Creating, + Owner: os.Getenv("USER"), + } + + // Create sandbox process. If anything errors between now and the end of this + // function, we MUST clean up all sandbox resources. + if err := s.createProcesses(conf, args); err != nil { + s.Destroy() + return nil, err + } + + // Wait for the control server to come up (or timeout). The sandbox is + // not "created" until that happens. + if err := s.waitForCreated(10 * time.Second); err != nil { + s.Destroy() + return nil, err + } + + s.Status = Created + s.CreatedAt = time.Now() + + // Save the metadata file. + if err := s.save(); err != nil { + s.Destroy() + return nil, err + } + + // Write the pid file. Containerd consideres the create complete after + // this file is created, so it must be the last thing we do. + if pidFile != "" { + if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(s.Pid)), 0644); err != nil { + s.Destroy() + return nil, fmt.Errorf("error writing pid file: %v", err) + } + } + + return s, nil +} + +// Run is a helper that calls Create + Start + Wait. +func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (syscall.WaitStatus, error) { + s, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, args) + if err != nil { + return 0, fmt.Errorf("error creating sandbox: %v", err) + } + if err := s.Start(conf); err != nil { + return 0, fmt.Errorf("error starting sandbox: %v", err) + } + return s.Wait() +} + +// Load loads a sandbox from with the given id from a metadata file. 
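// Editorial sketch (not part of this commit): with the json tags on Sandbox
// above, the meta.json written by save() later in this file looks roughly
// like the following. All values are made up, ConsoleSocket is tagged "-"
// and therefore omitted, and the exact encoding of "status" depends on the
// Status type defined in status.go.
//
//	{
//	  "id": "my-sandbox",
//	  "spec": { ... },
//	  "bundleDir": "/path/to/bundle",
//	  "sandboxRoot": "/var/run/runsc/my-sandbox",
//	  "createdAt": "2018-04-27T10:37:02-07:00",
//	  "owner": "root",
//	  "pid": 1234,
//	  "goferPid": 1235,
//	  "status": ...
//	}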
+func Load(rootDir, id string) (*Sandbox, error) { + log.Debugf("Load sandbox %q %q", rootDir, id) + if err := validateID(id); err != nil { + return nil, err + } + sandboxRoot := filepath.Join(rootDir, id) + if !exists(sandboxRoot) { + return nil, fmt.Errorf("sandbox with id %q does not exist", id) + } + metaFile := filepath.Join(sandboxRoot, metadataFilename) + if !exists(metaFile) { + return nil, fmt.Errorf("sandbox with id %q does not have metadata file %q", id, metaFile) + } + metaBytes, err := ioutil.ReadFile(metaFile) + if err != nil { + return nil, fmt.Errorf("error reading sandbox metadata file %q: %v", metaFile, err) + } + var s Sandbox + if err := json.Unmarshal(metaBytes, &s); err != nil { + return nil, fmt.Errorf("error unmarshaling sandbox metadata from %q: %v", metaFile, err) + } + + // If the status is "Running" or "Created", check that the process + // still exists, and set it to Stopped if it does not. + // + // This is inherintly racey. + if s.Status == Running || s.Status == Created { + // Send signal 0 to check if process exists. + if err := s.Signal(0); err != nil { + // Process no longer exists. + s.Status = Stopped + s.Pid = 0 + } + } + + return &s, nil +} + +// List returns all sandbox ids in the given root directory. +func List(rootDir string) ([]string, error) { + log.Debugf("List sandboxes %q", rootDir) + fs, err := ioutil.ReadDir(rootDir) + if err != nil { + return nil, fmt.Errorf("ReadDir(%s) failed: %v", rootDir, err) + } + var out []string + for _, f := range fs { + out = append(out, f.Name()) + } + return out, nil +} + +// State returns the metadata of the sandbox. +func (s *Sandbox) State() specs.State { + return specs.State{ + Version: specs.Version, + ID: s.ID, + Status: s.Status.String(), + Pid: s.Pid, + Bundle: s.BundleDir, + } +} + +// Start starts running the containerized process inside the sandbox. +func (s *Sandbox) Start(conf *boot.Config) error { + log.Debugf("Start sandbox %q, pid: %d", s.ID, s.Pid) + if s.Status != Created { + return fmt.Errorf("cannot start container in state %s", s.Status) + } + + // "If any prestart hook fails, the runtime MUST generate an error, + // stop and destroy the container". + if s.Spec.Hooks != nil { + if err := executeHooks(s.Spec.Hooks.Prestart, s.State()); err != nil { + s.Destroy() + return err + } + } + + c, err := s.connect() + if err != nil { + s.Destroy() + return err + } + defer c.Close() + + // Configure the network. + if err := setupNetwork(c, s.Pid, s.Spec, conf); err != nil { + s.Destroy() + return fmt.Errorf("error setting up network: %v", err) + } + + // Send a message to the sandbox control server to start the + // application. + if err := c.Call(boot.ApplicationStart, nil, nil); err != nil { + s.Destroy() + return fmt.Errorf("error starting sandbox: %v", err) + } + + // "If any poststart hook fails, the runtime MUST log a warning, but + // the remaining hooks and lifecycle continue as if the hook had + // succeeded". + if s.Spec.Hooks != nil { + executeHooksBestEffort(s.Spec.Hooks.Poststart, s.State()) + } + + s.Status = Running + return s.save() +} + +// Processes retrieves the list of processes and associated metadata inside a +// sandbox. +func (s *Sandbox) Processes() ([]*control.Process, error) { + if s.Status != Running { + return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. 
It is in state %v", s.ID, s.Status) + } + + c, err := s.connect() + if err != nil { + return nil, err + } + defer c.Close() + + var pl []*control.Process + if err := c.Call(boot.ApplicationProcesses, nil, &pl); err != nil { + return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) + } + return pl, nil +} + +// Execute runs the specified command in the sandbox. +func (s *Sandbox) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) { + log.Debugf("Execute in sandbox %q, pid: %d, args: %+v", s.ID, s.Pid, e) + if s.Status != Created && s.Status != Running { + return 0, fmt.Errorf("cannot exec in container in state %s", s.Status) + } + + log.Debugf("Connecting to sandbox...") + c, err := s.connect() + if err != nil { + return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + } + defer c.Close() + + // Send a message to the sandbox control server to start the application. + var waitStatus uint32 + if err := c.Call(boot.ApplicationExecute, e, &waitStatus); err != nil { + return 0, fmt.Errorf("error executing in sandbox: %v", err) + } + + return syscall.WaitStatus(waitStatus), nil +} + +// Event retrieves stats about the sandbox such as memory and CPU utilization. +func (s *Sandbox) Event() (*boot.Event, error) { + if s.Status != Running && s.Status != Created { + return nil, fmt.Errorf("cannot get events for container in state: %s", s.Status) + } + + c, err := s.connect() + if err != nil { + return nil, err + } + defer c.Close() + + var e boot.Event + if err := c.Call(boot.ApplicationEvent, nil, &e); err != nil { + return nil, fmt.Errorf("error retrieving event data from sandbox: %v", err) + } + e.ID = s.ID + return &e, nil +} + +func (s *Sandbox) connect() (*urpc.Client, error) { + log.Debugf("Connecting to sandbox...") + c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) + if err != nil { + return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + } + return c, nil +} + +func (s *Sandbox) createProcesses(conf *boot.Config, args []string) error { + binPath, err := specutils.BinPath() + if err != nil { + return err + } + + ioFiles, err := s.createGoferProcess(conf, binPath, args) + if err != nil { + return err + } + return s.createSandboxProcess(conf, binPath, args, ioFiles) +} + +func (s *Sandbox) createGoferProcess(conf *boot.Config, binPath string, commonArgs []string) ([]*os.File, error) { + if conf.FileAccess != boot.FileAccessProxy { + // Don't start a gofer. The sandbox will access host FS directly. + return nil, nil + } + + var args []string + args = append(args, commonArgs...) + args = append(args, "gofer", "--bundle", s.BundleDir) + + // Start with root mount and then add any other additional mount. + mountCount := 1 + for _, m := range s.Spec.Mounts { + if specutils.Is9PMount(m) { + mountCount++ + } + } + + sandEnds := make([]*os.File, 0, mountCount) + goferEnds := make([]*os.File, 0, mountCount) + for i := 0; i < mountCount; i++ { + // Create socket that connects the sandbox and gofer. + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + return nil, err + } + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) + + goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") + defer goferEnd.Close() + goferEnds = append(goferEnds, goferEnd) + + args = append(args, fmt.Sprintf("--io-fds=%d", 3+i)) + } + + cmd := exec.Command(binPath, args...) 
+ cmd.ExtraFiles = goferEnds + + // Setup any uid/gid mappings, and create or join the configured user + // namespace so the gofer's view of the filesystem aligns with the + // users in the sandbox. + setUIDGIDMappings(cmd, s.Spec) + nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, s.Spec) + + // Start the gofer in the given namespace. + log.Debugf("Starting gofer: %s %v", binPath, args) + if err := startInNS(cmd, nss); err != nil { + return nil, err + } + s.GoferPid = cmd.Process.Pid + log.Infof("Gofer started, pid: %d", cmd.Process.Pid) + return sandEnds, nil +} + +// createSandboxProcess starts the sandbox as a subprocess by running the "boot" +// command, passing in the bundle dir. +func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, commonArgs []string, ioFiles []*os.File) error { + // nextFD is used to get unused FDs that we can pass to the sandbox. It + // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. + nextFD := 3 + + // Create control server socket here and donate FD to child process because + // it may be in a different network namespace and won't be reachable from + // outside. + fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID)) + if err != nil { + return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) + } + + consoleEnabled := s.ConsoleSocket != "" + + cmd := exec.Command(binPath, commonArgs...) + cmd.SysProcAttr = &syscall.SysProcAttr{} + cmd.Args = append(cmd.Args, + "boot", + "--bundle", s.BundleDir, + "--controller-fd="+strconv.Itoa(nextFD), + fmt.Sprintf("--console=%t", consoleEnabled)) + nextFD++ + + controllerFile := os.NewFile(uintptr(fd), "control_server_socket") + defer controllerFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) + + // If there is a gofer, sends all socket ends to the sandbox. + for _, f := range ioFiles { + defer f.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD)) + nextFD++ + } + + // If the console control socket file is provided, then create a new + // pty master/slave pair and set the tty on the sandox process. + if consoleEnabled { + // setupConsole will send the master on the socket, and return + // the slave. + tty, err := setupConsole(s.ConsoleSocket) + if err != nil { + return fmt.Errorf("error setting up control socket %q: %v", s.ConsoleSocket, err) + } + defer tty.Close() + + cmd.Stdin = tty + cmd.Stdout = tty + cmd.Stderr = tty + cmd.SysProcAttr.Setctty = true + cmd.SysProcAttr.Ctty = int(tty.Fd()) + } else { + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + + // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT + // when re-parented. + cmd.SysProcAttr.Setsid = true + + // nss is the set of namespaces to join or create before starting the sandbox + // process. IPC and UTS namespaces from the host are not used as they + // are virtualized inside the sandbox. Be paranoid and run inside an empty + // namespace for these. + log.Infof("Sandbox will be started in empty IPC and UTS namespaces") + nss := []specs.LinuxNamespace{ + specs.LinuxNamespace{Type: specs.IPCNamespace}, + specs.LinuxNamespace{Type: specs.UTSNamespace}, + } + + if conf.Platform == boot.PlatformPtrace { + // TODO: Also set an empty PID namespace so that we limit + // access to other host processes. 
+ log.Infof("Sandbox will be started in the current PID namespace") + } else { + log.Infof("Sandbox will be started in empty PID namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) + } + + if conf.FileAccess == boot.FileAccessProxy { + log.Infof("Sandbox will be started in empty mount namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) + } else { + log.Infof("Sandbox will be started in the current mount namespace") + } + + // Joins the network namespace if network is enabled. the sandbox talks + // directly to the host network, which may have been configured in the + // namespace. + if ns, ok := getNS(specs.NetworkNamespace, s.Spec); ok && conf.Network != boot.NetworkNone { + log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) + nss = append(nss, ns) + } else { + log.Infof("Sandbox will be started in empty network namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) + } + + // User namespace depends on the following options: + // - Host network/filesystem: requires to run inside the user namespace + // specified in the spec or the current namespace if none is configured. + // - Gofer: when using a Gofer, the sandbox process can run isolated in an + // empty namespace. + if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { + if userns, ok := getNS(specs.UserNamespace, s.Spec); ok { + log.Infof("Sandbox will be started in container's user namespace: %+v", userns) + nss = append(nss, userns) + setUIDGIDMappings(cmd, s.Spec) + } else { + // TODO: Retrict capabilities since it's using current user + // namespace, i.e. root. + log.Infof("Sandbox will be started in the current user namespace") + } + // When running in the caller's defined user namespace, apply the same + // capabilities to the sandbox process to ensure it abides to the same + // rules. + cmd.Args = append(cmd.Args, "--apply-caps=true") + + } else { + log.Infof("Sandbox will be started in empty user namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) + } + + log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) + if err := startInNS(cmd, nss); err != nil { + return err + } + s.Pid = cmd.Process.Pid + log.Infof("Sandbox started, pid: %d", s.Pid) + return nil +} + +// waitForCreated waits for the sandbox subprocess control server to be +// running, at which point the sandbox is in Created state. +func (s *Sandbox) waitForCreated(timeout time.Duration) error { + log.Debugf("Waiting for sandbox %q creation", s.ID) + tchan := time.After(timeout) + for { + select { + case <-tchan: + return fmt.Errorf("timed out waiting for sandbox control server") + default: + if c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)); err == nil { + // It's alive! + c.Close() + return nil + } + } + } +} + +// Wait waits for the containerized process to exit, and returns its WaitStatus. +func (s *Sandbox) Wait() (syscall.WaitStatus, error) { + log.Debugf("Wait on sandbox %q with pid %d", s.ID, s.Pid) + p, err := os.FindProcess(s.Pid) + if err != nil { + // "On Unix systems, FindProcess always succeeds and returns a + // Process for the given pid." + panic(err) + } + ps, err := p.Wait() + if err != nil { + return 0, err + } + return ps.Sys().(syscall.WaitStatus), nil +} + +// Destroy frees all resources associated with the sandbox. +func (s *Sandbox) Destroy() error { + log.Debugf("Destroy sandbox %q", s.ID) + if s.Pid != 0 { + // TODO: Too harsh? 
+ log.Debugf("Killing sandbox %q", s.ID) + sendSignal(s.Pid, unix.SIGKILL) + s.Pid = 0 + } + if s.GoferPid != 0 { + log.Debugf("Killing gofer for sandbox %q", s.ID) + sendSignal(s.GoferPid, unix.SIGKILL) + s.GoferPid = 0 + } + if err := os.RemoveAll(s.SandboxRoot); err != nil { + log.Warningf("Failed to delete sandbox root directory %q, err: %v", s.SandboxRoot, err) + } + + // "If any poststop hook fails, the runtime MUST log a warning, but the + // remaining hooks and lifecycle continue as if the hook had succeeded". + if s.Spec.Hooks != nil && (s.Status == Created || s.Status == Running) { + executeHooksBestEffort(s.Spec.Hooks.Poststop, s.State()) + } + + s.Status = Stopped + return nil +} + +// Signal sends the signal to the sandbox. +func (s *Sandbox) Signal(sig syscall.Signal) error { + log.Debugf("Signal sandbox %q", s.ID) + if s.Status == Stopped { + log.Warningf("sandbox %q not running, not sending signal %v to pid %d", s.ID, sig, s.Pid) + return nil + } + return sendSignal(s.Pid, sig) +} + +func sendSignal(pid int, sig syscall.Signal) error { + if err := syscall.Kill(pid, sig); err != nil { + return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err) + } + return nil +} + +// save saves the sandbox metadata to a file. +func (s *Sandbox) save() error { + log.Debugf("Save sandbox %q", s.ID) + if err := os.MkdirAll(s.SandboxRoot, 0711); err != nil { + return fmt.Errorf("error creating sandbox root directory %q: %v", s.SandboxRoot, err) + } + meta, err := json.Marshal(s) + if err != nil { + return fmt.Errorf("error marshaling sandbox metadata: %v", err) + } + metaFile := filepath.Join(s.SandboxRoot, metadataFilename) + if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil { + return fmt.Errorf("error writing sandbox metadata: %v", err) + } + return nil +} + +// exists returns true if the given file exists. +func exists(f string) bool { + if _, err := os.Stat(f); err == nil { + return true + } else if !os.IsNotExist(err) { + log.Warningf("error checking for file %q: %v", f, err) + } + return false +} diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go new file mode 100644 index 000000000..6c71cac30 --- /dev/null +++ b/runsc/sandbox/sandbox_test.go @@ -0,0 +1,649 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package sandbox_test + +import ( + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "os/signal" + "path/filepath" + "reflect" + "strings" + "syscall" + "testing" + "time" + + "context" + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cmd" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +func init() { + log.SetLevel(log.Debug) +} + +// writeSpec writes the spec to disk in the given directory. +func writeSpec(dir string, spec *specs.Spec) error { + b, err := json.Marshal(spec) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755) +} + +// newSpecWithArgs creates a simple spec with the given args suitable for use +// in tests. +func newSpecWithArgs(args ...string) *specs.Spec { + spec := &specs.Spec{ + // The host filesystem root is the sandbox root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: args, + Env: []string{ + "PATH=" + os.Getenv("PATH"), + }, + }, + } + return spec +} + +// shutdownSignal will be sent to the sandbox in order to shut down cleanly. +const shutdownSignal = syscall.SIGUSR2 + +// setupSandbox creates a bundle and root dir for the sandbox, generates a test +// config, and writes the spec to config.json in the bundle dir. +func setupSandbox(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { + rootDir, err = ioutil.TempDir("", "sandboxes") + if err != nil { + return "", "", nil, fmt.Errorf("error creating root dir: %v", err) + } + + bundleDir, err = ioutil.TempDir("", "bundle") + if err != nil { + return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err) + } + + if err = writeSpec(bundleDir, spec); err != nil { + return "", "", nil, fmt.Errorf("error writing spec: %v", err) + } + + conf = &boot.Config{ + RootDir: rootDir, + Network: boot.NetworkNone, + } + + return rootDir, bundleDir, conf, nil +} + +// uniqueSandboxID generates a unique sandbox id for each test. +// +// The sandbox id is used to create an abstract unix domain socket, which must +// be unique. While the sandbox forbids creating two sandboxes with the same +// name, sometimes between test runs the socket does not get cleaned up quickly +// enough, causing sandbox creation to fail. +func uniqueSandboxID() string { + return fmt.Sprintf("test-sandbox-%d", time.Now().UnixNano()) +} + +// waitForProcessList waits for the given process list to show up in the sandbox. +func waitForProcessList(s *sandbox.Sandbox, expected []*control.Process) error { + var got []*control.Process + for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { + var err error + got, err := s.Processes() + if err != nil { + return fmt.Errorf("error getting process data from sandbox: %v", err) + } + if procListsEqual(got, expected) { + return nil + } + // Process might not have started, try again... + time.Sleep(10 * time.Millisecond) + } + return fmt.Errorf("sandbox got process list: %s, want: %s", procListToString(got), procListToString(expected)) +} + +// TestLifecycle tests the basic Create/Start/Signal/Destory sandbox lifecycle. 
+// It verifies after each step that the sandbox can be loaded from disk, and +// has the correct status. +func TestLifecycle(t *testing.T) { + // The sandbox will just sleep for a long time. We will kill it before + // it finishes sleeping. + spec := newSpecWithArgs("sleep", "100") + + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // expectedPL lists the expected process state of the sandbox. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + // Create the sandbox. + id := uniqueSandboxID() + if _, err := sandbox.Create(id, spec, conf, bundleDir, "", "", nil); err != nil { + t.Fatalf("error creating sandbox: %v", err) + } + // Load the sandbox from disk and check the status. + s, err := sandbox.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading sandbox: %v", err) + } + if got, want := s.Status, sandbox.Created; got != want { + t.Errorf("sandbox status got %v, want %v", got, want) + } + + // List should return the sandbox id. + ids, err := sandbox.List(rootDir) + if err != nil { + t.Fatalf("error listing sandboxes: %v", err) + } + if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) { + t.Errorf("sandbox list got %v, want %v", got, want) + } + + // Start the sandbox. + if err := s.Start(conf); err != nil { + t.Fatalf("error starting sandbox: %v", err) + } + // Load the sandbox from disk and check the status. + s, err = sandbox.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading sandbox: %v", err) + } + if got, want := s.Status, sandbox.Running; got != want { + t.Errorf("sandbox status got %v, want %v", got, want) + } + + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL); err != nil { + t.Error(err) + } + + // Send the sandbox a signal, which we catch and use to cleanly + // shutdown. + if err := s.Signal(shutdownSignal); err != nil { + t.Fatalf("error sending signal %v to sandbox: %v", shutdownSignal, err) + } + // Wait for it to die. + if _, err := s.Wait(); err != nil { + t.Fatalf("error waiting on sandbox: %v", err) + } + // Load the sandbox from disk and check the status. + s, err = sandbox.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading sandbox: %v", err) + } + if got, want := s.Status, sandbox.Stopped; got != want { + t.Errorf("sandbox status got %v, want %v", got, want) + } + + // Destroy the sandbox. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying sandbox: %v", err) + } + + // List should not return the sandbox id. + ids, err = sandbox.List(rootDir) + if err != nil { + t.Fatalf("error listing sandboxes: %v", err) + } + if len(ids) != 0 { + t.Errorf("expected sandbox list to be empty, but got %v", ids) + } + + // Loading the sandbox by id should fail. + if _, err = sandbox.Load(rootDir, id); err == nil { + t.Errorf("expected loading destroyed sandbox to fail, but it did not") + } +} + +// Test the we can execute the application with different path formats. 
+func TestExePath(t *testing.T) { + for _, test := range []struct { + path string + success bool + }{ + {path: "true", success: true}, + {path: "bin/true", success: true}, + {path: "/bin/true", success: true}, + {path: "thisfiledoesntexit", success: false}, + {path: "bin/thisfiledoesntexit", success: false}, + {path: "/bin/thisfiledoesntexit", success: false}, + } { + spec := newSpecWithArgs(test.path) + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("exec: %s, error setting up sandbox: %v", test.path, err) + } + + ws, err := sandbox.Run(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) + + os.RemoveAll(rootDir) + os.RemoveAll(bundleDir) + + if test.success { + if err != nil { + t.Errorf("exec: %s, error running sandbox: %v", test.path, err) + } + if ws.ExitStatus() != 0 { + t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) + } + } else { + if err == nil { + t.Errorf("exec: %s, got: no error, want: error", test.path) + } + } + } +} + +// Test the we can retrieve the application exit status from the sandbox. +func TestAppExitStatus(t *testing.T) { + // First sandbox will succeed. + succSpec := newSpecWithArgs("true") + + rootDir, bundleDir, conf, err := setupSandbox(succSpec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + ws, err := sandbox.Run(uniqueSandboxID(), succSpec, conf, bundleDir, "", "", nil) + if err != nil { + t.Fatalf("error running sandbox: %v", err) + } + if ws.ExitStatus() != 0 { + t.Errorf("got exit status %v want %v", ws.ExitStatus(), 0) + } + + // Second sandbox exits with non-zero status. + wantStatus := 123 + errSpec := newSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) + + rootDir2, bundleDir2, conf, err := setupSandbox(errSpec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir2) + defer os.RemoveAll(bundleDir2) + + ws, err = sandbox.Run(uniqueSandboxID(), succSpec, conf, bundleDir2, "", "", nil) + if err != nil { + t.Fatalf("error running sandbox: %v", err) + } + if ws.ExitStatus() != wantStatus { + t.Errorf("got exit status %v want %v", ws.ExitStatus(), wantStatus) + } +} + +// TestExec verifies that a sandbox can exec a new program. +func TestExec(t *testing.T) { + const uid = 343 + spec := newSpecWithArgs("sleep", "100") + + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the sandbox. + s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) + if err != nil { + t.Fatalf("error creating sandbox: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting sandbox: %v", err) + } + + // expectedPL lists the expected process state of the sandbox. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Error(err) + } + + execArgs := control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"sleep", "5"}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + Detach: false, + } + + // Verify that "sleep 100" and "sleep 5" are running after exec. 
+	// First, start running exec (which blocks).
+	status := make(chan error, 1)
+	go func() {
+		exitStatus, err := s.Execute(&execArgs)
+		if err != nil {
+			status <- err
+		} else if exitStatus != 0 {
+			status <- fmt.Errorf("failed with exit status: %v", exitStatus)
+		} else {
+			status <- nil
+		}
+	}()
+
+	if err := waitForProcessList(s, expectedPL); err != nil {
+		t.Fatal(err)
+	}
+
+	// Ensure that exec finished without error.
+	select {
+	case <-time.After(10 * time.Second):
+		t.Fatalf("sandbox timed out waiting for exec to finish.")
+	case st := <-status:
+		if st != nil {
+			t.Errorf("sandbox failed to exec %v: %v", execArgs, st)
+		}
+	}
+}
+
+// TestCapabilities verifies that:
+// - Running exec as non-root UID and GID will result in an error (because the
+//   executable file can't be read).
+// - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips
+//   this check.
+func TestCapabilities(t *testing.T) {
+	const uid = 343
+	const gid = 2401
+	spec := newSpecWithArgs("sleep", "100")
+
+	// We generate files in the host temporary directory.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: os.TempDir(),
+		Source:      os.TempDir(),
+		Type:        "bind",
+	})
+
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create and start the sandbox.
+	s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil)
+	if err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+	defer s.Destroy()
+	if err := s.Start(conf); err != nil {
+		t.Fatalf("error starting sandbox: %v", err)
+	}
+
+	// expectedPL lists the expected process state of the sandbox.
+	expectedPL := []*control.Process{
+		{
+			UID:  0,
+			PID:  1,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+		{
+			UID:  uid,
+			PID:  2,
+			PPID: 0,
+			C:    0,
+			Cmd:  "exe",
+		},
+	}
+	if err := waitForProcessList(s, expectedPL[:1]); err != nil {
+		t.Fatalf("Failed to wait for sleep to start, err: %v", err)
+	}
+
+	// Create an executable that can't be run with the specified UID:GID.
+	// This shouldn't be callable within the sandbox until we add the
+	// CAP_DAC_OVERRIDE capability to skip the access check.
+	exePath := filepath.Join(rootDir, "exe")
+	if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil {
+		t.Fatalf("couldn't create executable: %v", err)
+	}
+	defer os.Remove(exePath)
+
+	// Need to traverse the intermediate directory.
+	os.Chmod(rootDir, 0755)
+
+	execArgs := control.ExecArgs{
+		Filename:         exePath,
+		Argv:             []string{exePath},
+		Envv:             []string{"PATH=" + os.Getenv("PATH")},
+		WorkingDirectory: "/",
+		KUID:             uid,
+		KGID:             gid,
+		Capabilities:     &auth.TaskCapabilities{},
+		Detach:           true,
+	}
+
+	// "exe" should fail because we don't have the necessary permissions.
+	if _, err := s.Execute(&execArgs); err == nil {
+		t.Fatalf("sandbox executed without error, but an error was expected")
+	}
+
+	// Now we run with the capability enabled and should succeed.
+	execArgs.Capabilities = &auth.TaskCapabilities{
+		EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+	}
+	// First, start running exec.
+	if _, err := s.Execute(&execArgs); err != nil {
+		t.Fatalf("sandbox failed to exec %v: %v", execArgs, err)
+	}
+
+	if err := waitForProcessList(s, expectedPL); err != nil {
+		t.Error(err)
+	}
+}
+
+// Test that a tty FD is sent over the console socket if one is provided.
+func TestConsoleSocket(t *testing.T) {
+	spec := newSpecWithArgs("true")
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create a named socket and start listening. We use a relative path
+	// to avoid overflowing the unix path length limit (108 chars).
+	socketPath := filepath.Join(bundleDir, "socket")
+	cwd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("error getting cwd: %v", err)
+	}
+	socketRelPath, err := filepath.Rel(cwd, socketPath)
+	if err != nil {
+		t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err)
+	}
+	if len(socketRelPath) > len(socketPath) {
+		socketRelPath = socketPath
+	}
+	srv, err := unet.BindAndListen(socketRelPath, false)
+	if err != nil {
+		t.Fatalf("error binding and listening to socket %q: %v", socketPath, err)
+	}
+	defer os.Remove(socketPath)
+
+	// Create the sandbox and pass the socket name.
+	id := uniqueSandboxID()
+	s, err := sandbox.Create(id, spec, conf, bundleDir, socketRelPath, "", nil)
+	if err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+
+	// Open the other end of the socket.
+	sock, err := srv.Accept()
+	if err != nil {
+		t.Fatalf("error accepting socket connection: %v", err)
+	}
+
+	// Allow 1 FD to be received; that is all we expect.
+	r := sock.Reader(true /* blocking */)
+	r.EnableFDs(1)
+
+	// The socket is closed right after sending the FD, so EOF is
+	// an allowed error.
+	b := [][]byte{{}}
+	if _, err := r.ReadVec(b); err != nil && err != io.EOF {
+		t.Fatalf("error reading from socket connection: %v", err)
+	}
+
+	// We should have gotten a control message.
+	fds, err := r.ExtractFDs()
+	if err != nil {
+		t.Fatalf("error extracting fds from socket connection: %v", err)
+	}
+	if len(fds) != 1 {
+		t.Fatalf("got %d fds from socket, wanted 1", len(fds))
+	}
+
+	// Verify that the fd is a terminal.
+	if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil {
+		t.Errorf("fd is not a terminal (ioctl TCGETS got %v)", err)
+	}
+
+	// Shut it down.
+	if err := s.Destroy(); err != nil {
+		t.Fatalf("error destroying sandbox: %v", err)
+	}
+
+	// Close the socket.
+	if err := srv.Close(); err != nil {
+		t.Fatalf("error closing socket: %v", err)
+	}
+}
+
+// procListsEqual is used to check whether 2 Process lists are equal for all
+// implemented fields.
+func procListsEqual(got, want []*control.Process) bool {
+	if len(got) != len(want) {
+		return false
+	}
+	for i := range got {
+		pd1 := got[i]
+		pd2 := want[i]
+		// Zero out unimplemented and timing dependent fields.
+		pd1.Time, pd2.Time = "", ""
+		pd1.STime, pd2.STime = "", ""
+		pd1.C, pd2.C = 0, 0
+		if *pd1 != *pd2 {
+			return false
+		}
+	}
+	return true
+}
+
+func procListToString(pl []*control.Process) string {
+	strs := make([]string, 0, len(pl))
+	for _, p := range pl {
+		strs = append(strs, fmt.Sprintf("%+v", p))
+	}
+	return fmt.Sprintf("[%s]", strings.Join(strs, ","))
+}
+
+// TestMain acts like runsc if it is called with the "boot" argument, otherwise
+// it just runs the tests. This is required because creating a sandbox will
+// call "/proc/self/exe boot". Normally /proc/self/exe is the runsc binary,
+// but for tests we have to fake it.
+func TestMain(m *testing.M) {
+	// exit writes coverage data before exiting.
+	exit := func(status int) {
+		os.Exit(status)
+	}
+
+	if !flag.Parsed() {
+		flag.Parse()
+	}
+
+	// If we are passed one of the commands then run it.
+	subcommands.Register(new(cmd.Boot), "boot")
+	subcommands.Register(new(cmd.Gofer), "gofer")
+	switch flag.Arg(0) {
+	case "boot", "gofer":
+		// Run the command in a goroutine so we can block the main
+		// thread waiting for shutdownSignal.
+		go func() {
+			conf := &boot.Config{
+				RootDir: "unused-root-dir",
+				Network: boot.NetworkNone,
+			}
+			var ws syscall.WaitStatus
+			subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
+			if subcmdCode != subcommands.ExitSuccess {
+				panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode))
+			}
+			// Sandbox exited normally. Shut down this process.
+			os.Exit(ws.ExitStatus())
+		}()
+
+		// Shutdown cleanly when the shutdownSignal is received. This
+		// allows us to write coverage data before exiting.
+		sigc := make(chan os.Signal, 1)
+		signal.Notify(sigc, shutdownSignal)
+		<-sigc
+		exit(0)
+	default:
+		// Otherwise run the tests.
+		exit(m.Run())
+	}
+}
diff --git a/runsc/sandbox/status.go b/runsc/sandbox/status.go
new file mode 100644
index 000000000..6fc936aba
--- /dev/null
+++ b/runsc/sandbox/status.go
@@ -0,0 +1,56 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+// Status enumerates sandbox statuses. The statuses and their semantics are
+// part of the runtime CLI spec.
+//
+// TODO: Get precise about the transitions between statuses.
+type Status int
+
+const (
+	// Creating indicates "the container is being created".
+	Creating Status = iota
+
+	// Created indicates "the runtime has finished the create operation and
+	// the container process has neither exited nor executed the
+	// user-specified program".
+	Created
+
+	// Running indicates "the container process has executed the
+	// user-specified program but has not exited".
+	Running
+
+	// Stopped indicates "the container process has exited".
+	Stopped
+)
+
+// String converts a Status to a string. These strings are part of the runtime
+// CLI spec and should not be changed.
+func (s Status) String() string {
+	switch s {
+	case Creating:
+		return "creating"
+	case Created:
+		return "created"
+	case Running:
+		return "running"
+	case Stopped:
+		return "stopped"
+	default:
+		return "unknown"
+	}
+
+}
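For reference, a minimal, hypothetical sketch of the create/start/wait/destroy sequence that TestLifecycle and TestAppExitStatus exercise above. It is not part of this change: it assumes the caller already holds an OCI spec (spec), a boot.Config (conf), a sandbox ID (id), and a bundle directory (bundleDir), the argument order simply mirrors the test calls, and error handling is abbreviated.

	// Create the sandbox process; the two empty strings and the nil argument
	// mirror the placeholder values passed by the tests above.
	s, err := sandbox.Create(id, spec, conf, bundleDir, "", "", nil)
	if err != nil {
		return err
	}
	defer s.Destroy()

	// Start the user-specified program inside the sandbox.
	if err := s.Start(conf); err != nil {
		return err
	}

	// Wait blocks until the application exits and returns its wait status.
	ws, err := s.Wait()
	if err != nil {
		return err
	}
	fmt.Printf("application exited with status %d\n", ws.ExitStatus())

sandbox.Run, used by TestExePath and TestAppExitStatus, combines the create/start/wait steps into a single call and returns the same wait status.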