Diffstat (limited to 'runsc')
 runsc/BUILD                             |   2
 runsc/boot/BUILD                        |   1
 runsc/boot/config.go                    |  18
 runsc/boot/controller.go                |  12
 runsc/boot/filter/config.go             |   2
 runsc/boot/fs.go                        | 182
 runsc/boot/loader.go                    | 125
 runsc/boot/loader_test.go               |  19
 runsc/boot/network.go                   |  42
 runsc/boot/user_test.go                 |  12
 runsc/container/BUILD                   |   1
 runsc/container/container.go            | 111
 runsc/container/container_test.go       |  18
 runsc/container/multi_container_test.go | 213
 runsc/fsgofer/filter/config.go          |   2
 runsc/fsgofer/fsgofer.go                |  10
 runsc/fsgofer/fsgofer_test.go           |  50
 runsc/main.go                           |  13
 runsc/sandbox/network.go                |  38
 runsc/sandbox/sandbox.go                |   4
 runsc/specutils/specutils.go            |  11
 runsc/test/testutil/docker.go           |   6
 runsc/test/testutil/testutil.go         |  38
 23 files changed, 688 insertions(+), 242 deletions(-)
diff --git a/runsc/BUILD b/runsc/BUILD
index 6b8c92706..cc8852d7d 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -16,6 +16,7 @@ go_binary(
     x_defs = {"main.version": "{VERSION}"},
     deps = [
         "//pkg/log",
+        "//pkg/refs",
         "//pkg/sentry/platform",
         "//runsc/boot",
         "//runsc/cmd",
@@ -48,6 +49,7 @@ go_binary(
     x_defs = {"main.version": "{VERSION}"},
     deps = [
         "//pkg/log",
+        "//pkg/refs",
         "//pkg/sentry/platform",
         "//runsc/boot",
         "//runsc/cmd",
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 5025401dd..588bb8851 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -88,6 +88,7 @@ go_library(
         "//runsc/specutils",
         "@com_github_golang_protobuf//proto:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 6a742f349..05b8f8761 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -113,7 +113,7 @@ func MakeWatchdogAction(s string) (watchdog.Action, error) {
     }
 }
 
-// MakeRefsLeakMode converts type from string
+// MakeRefsLeakMode converts type from string.
 func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
     switch strings.ToLower(s) {
     case "disabled":
@@ -127,6 +127,20 @@ func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
     }
 }
 
+func refsLeakModeToString(mode refs.LeakMode) string {
+    switch mode {
+    // If not set, default it to disabled.
+    case refs.UninitializedLeakChecking, refs.NoLeakChecking:
+        return "disabled"
+    case refs.LeaksLogWarning:
+        return "log-names"
+    case refs.LeaksLogTraces:
+        return "log-traces"
+    default:
+        panic(fmt.Sprintf("Invalid leakmode: %d", mode))
+    }
+}
+
 // Config holds configuration that is not part of the runtime spec.
 type Config struct {
     // RootDir is the runtime root directory.
@@ -245,7 +259,7 @@ func (c *Config) ToFlags() []string {
         "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
         "--rootless=" + strconv.FormatBool(c.Rootless),
         "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
-        "--ref-leak-mode=" + c.ReferenceLeakMode.String(),
+        "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
     }
     if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
         // Only include if set since it is never to be used by users.
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index d79aaff60..72cbabd16 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -328,10 +328,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
         return fmt.Errorf("at most two files may be passed to Restore")
     }
 
-    networkStack := cm.l.k.NetworkStack()
-    // Destroy the old kernel and create a new kernel.
+    // Pause the kernel while we build a new one.
     cm.l.k.Pause()
-    cm.l.k.Destroy()
 
     p, err := createPlatform(cm.l.conf, deviceFile)
     if err != nil {
@@ -345,10 +343,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
         return fmt.Errorf("creating memory file: %v", err)
     }
     k.SetMemoryFile(mf)
+    networkStack := cm.l.k.NetworkStack()
     cm.l.k = k
 
     // Set up the restore environment.
-    mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints)
+    mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
     renv, err := mntr.createRestoreEnvironment(cm.l.conf)
     if err != nil {
         return fmt.Errorf("creating RestoreEnvironment: %v", err)
@@ -380,13 +379,10 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
     // Load the state.
     loadOpts := state.LoadOpts{Source: specFile}
-    if err := loadOpts.Load(k, networkStack); err != nil {
+    if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil {
         return err
     }
 
-    // Set timekeeper.
-    k.Timekeeper().SetClocks(time.NewCalibratedClocks())
-
     // Since we have a new kernel we also must make a new watchdog.
     dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 0ee5b8bbd..7ca776b3a 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -207,7 +207,7 @@ var allowedSyscalls = seccomp.SyscallRules{
     syscall.SYS_MPROTECT:  {},
     syscall.SYS_MUNMAP:    {},
     syscall.SYS_NANOSLEEP: {},
-    syscall.SYS_POLL:      {},
+    syscall.SYS_PPOLL:     {},
     syscall.SYS_PREAD64:   {},
     syscall.SYS_PWRITE64:  {},
     syscall.SYS_READ:      {},
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 7e95e1f41..34c674840 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -25,10 +25,8 @@ import (
 
     // Include filesystem types that OCI spec might mount.
     _ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
-    "gvisor.dev/gvisor/pkg/sentry/fs/gofer"
     _ "gvisor.dev/gvisor/pkg/sentry/fs/host"
     _ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
-    "gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
     _ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
     _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
     _ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
@@ -38,6 +36,8 @@ import (
     "gvisor.dev/gvisor/pkg/log"
     "gvisor.dev/gvisor/pkg/sentry/context"
     "gvisor.dev/gvisor/pkg/sentry/fs"
+    "gvisor.dev/gvisor/pkg/sentry/fs/gofer"
+    "gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
     "gvisor.dev/gvisor/pkg/sentry/kernel"
     "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
     "gvisor.dev/gvisor/pkg/syserror"
@@ -263,6 +263,18 @@ func subtargets(root string, mnts []specs.Mount) []string {
     return targets
 }
 
+func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+    mns, err := mntr.setupFS(conf, procArgs)
+    if err != nil {
+        return err
+    }
+
+    // Set namespace here so that it can be found in ctx.
+    procArgs.MountNamespace = mns
+
+    return setExecutablePath(ctx, procArgs)
+}
+
 // setExecutablePath sets the procArgs.Filename by searching the PATH for an
 // executable matching the procArgs.Argv[0].
 func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
@@ -478,9 +490,6 @@ func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
 }
 
 type containerMounter struct {
-    // cid is the container ID. May be set to empty for the root container.
-    cid string
-
     root *specs.Root
 
     // mounts is the set of submounts for the container. It's a copy from the spec
@@ -495,9 +504,8 @@ type containerMounter struct {
     hints *podMountHints
 }
 
-func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
     return &containerMounter{
-        cid:    cid,
         root:   spec.Root,
         mounts: compileMounts(spec),
         fds:    fdDispenser{fds: goferFDs},
@@ -506,63 +514,88 @@ func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel
     }
 }
 
-// setupFS is used to set up the file system for containers and amend
-// the procArgs accordingly. This is the main entry point for this rest of
-// functions in this file. procArgs are passed by reference and the FDMap field
-// is modified. It dups stdioFDs.
-func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
-    // Use root user to configure mounts. The current user might not have
-    // permission to do so.
-    rootProcArgs := kernel.CreateProcessArgs{
-        WorkingDirectory:     "/",
-        Credentials:          auth.NewRootCredentials(creds.UserNamespace),
-        Umask:                0022,
-        MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
-        PIDNamespace:         procArgs.PIDNamespace,
+// processHints processes annotations that contain hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHints(conf *Config) error {
+    ctx := c.k.SupervisorContext()
+    for _, hint := range c.hints.mounts {
+        log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+        inode, err := c.mountSharedMaster(ctx, conf, hint)
+        if err != nil {
+            return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+        }
+        hint.root = inode
     }
+    return nil
+}
+
+// setupFS is used to set up the file system for all containers. This is the
+// main entry point method, with most of the other being internal only. It
+// returns the mount namespace that is created for the container.
+func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+    log.Infof("Configuring container's file system")
+
+    // Create context with root credentials to mount the filesystem (the current
+    // user may not be privileged enough).
+    rootProcArgs := *procArgs
+    rootProcArgs.WorkingDirectory = "/"
+    rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+    rootProcArgs.Umask = 0022
+    rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
     rootCtx := rootProcArgs.NewContext(c.k)
 
-    // If this is the root container, we also need to setup the root mount
-    // namespace.
-    rootMNS := c.k.RootMountNamespace()
-    if rootMNS == nil {
-        // Setup the root container.
-        if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) {
-            // The callback to setupRootContainer inherits a
-            // reference on the rootMNS, so we don't need to take
-            // an additional reference here.
-            procArgs.MountNamespace = rootMNS
-            procArgs.Root = rootMNS.Root()
-            c.k.SetRootMountNamespace(rootMNS)
-        }); err != nil {
-            return err
-        }
-        return c.checkDispenser()
+    mns, err := c.createMountNamespace(rootCtx, conf)
+    if err != nil {
+        return nil, err
     }
 
-    // Setup a child container.
-    log.Infof("Creating new process in child container.")
+    // Set namespace here so that it can be found in rootCtx.
+    rootProcArgs.MountNamespace = mns
 
-    // Create a new root inode and mount namespace for the container.
-    rootInode, err := c.createRootMount(rootCtx, conf)
+    if err := c.mountSubmounts(rootCtx, conf, mns); err != nil {
+        return nil, err
+    }
+    return mns, nil
+}
+
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+    rootInode, err := c.createRootMount(ctx, conf)
     if err != nil {
-        return fmt.Errorf("creating filesystem for container: %v", err)
+        return nil, fmt.Errorf("creating filesystem for container: %v", err)
     }
-    mns, err := fs.NewMountNamespace(rootCtx, rootInode)
+    mns, err := fs.NewMountNamespace(ctx, rootInode)
     if err != nil {
-        return fmt.Errorf("creating new mount namespace for container: %v", err)
+        return nil, fmt.Errorf("creating new mount namespace for container: %v", err)
     }
+    return mns, nil
+}
 
-    // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
-    // This will also donate a reference to procArgs, as required.
-    procArgs.MountNamespace = mns
-    procArgs.Root = mns.Root()
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+    root := mns.Root()
+    defer root.DecRef()
 
-    // Mount all submounts.
-    if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil {
+    for _, m := range c.mounts {
+        log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
+        if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+            if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+                return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+            }
+        } else {
+            if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+                return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+            }
+        }
+    }
+
+    if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+        return fmt.Errorf("mount submount %q: %v", "tmp", err)
+    }
+
+    if err := c.checkDispenser(); err != nil {
         return err
     }
-    return c.checkDispenser()
+    return nil
 }
 
 func (c *containerMounter) checkDispenser() error {
@@ -572,36 +605,6 @@ func (c *containerMounter) checkDispenser() error {
     return nil
 }
 
-// setupRootContainer creates a mount namespace containing the root filesystem
-// and all mounts. 'rootCtx' is used to walk directories to find mount points.
-// The 'setMountNS' callback is called after the mount namespace is created and
-// will get a reference on that namespace. The callback must ensure that the
-// rootCtx has the provided mount namespace.
-func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
-    for _, hint := range c.hints.mounts {
-        log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
-        inode, err := c.mountSharedMaster(rootCtx, conf, hint)
-        if err != nil {
-            return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
-        }
-        hint.root = inode
-    }
-
-    rootInode, err := c.createRootMount(rootCtx, conf)
-    if err != nil {
-        return fmt.Errorf("creating root mount: %v", err)
-    }
-    mns, err := fs.NewMountNamespace(userCtx, rootInode)
-    if err != nil {
-        return fmt.Errorf("creating root mount namespace: %v", err)
-    }
-    setMountNS(mns)
-
-    root := mns.Root()
-    defer root.DecRef()
-    return c.mountSubmounts(rootCtx, conf, mns, root)
-}
-
 // mountSharedMaster mounts the master of a volume that is shared among
 // containers in a pod. It returns the root mount's inode.
 func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
@@ -717,25 +720,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
     return fsName, opts, useOverlay, err
 }
 
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
-    for _, m := range c.mounts {
-        if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
-            if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
-                return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
-            }
-        } else {
-            if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
-                return fmt.Errorf("mount submount %q: %v", m.Destination, err)
-            }
-        }
-    }
-
-    if err := c.mountTmp(ctx, conf, mns, root); err != nil {
-        return fmt.Errorf("mount submount %q: %v", "tmp", err)
-    }
-    return nil
-}
-
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 65ac67dbf..19b738705 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -27,12 +27,12 @@ import (
     gtime "time"
 
     specs "github.com/opencontainers/runtime-spec/specs-go"
+    "golang.org/x/sys/unix"
     "gvisor.dev/gvisor/pkg/abi/linux"
     "gvisor.dev/gvisor/pkg/cpuid"
     "gvisor.dev/gvisor/pkg/log"
     "gvisor.dev/gvisor/pkg/memutil"
     "gvisor.dev/gvisor/pkg/rand"
-    "gvisor.dev/gvisor/pkg/refs"
     "gvisor.dev/gvisor/pkg/sentry/arch"
     "gvisor.dev/gvisor/pkg/sentry/control"
     "gvisor.dev/gvisor/pkg/sentry/fs"
@@ -181,9 +181,6 @@ type Args struct {
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
 func New(args Args) (*Loader, error) {
-    // Sets the reference leak check mode
-    refs.SetLeakMode(args.Conf.ReferenceLeakMode)
-
     // We initialize the rand package now to make sure /dev/urandom is pre-opened
     // on kernels that do not support getrandom(2).
     if err := rand.Init(); err != nil {
@@ -527,15 +524,14 @@ func (l *Loader) run() error {
     // ours either way.
     l.rootProcArgs.FDTable = fdTable
 
-    // cid for root container can be empty. Only subcontainers need it to set
-    // the mount location.
-    mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
-    if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
+    // Setup the root container file system.
+    l.startGoferMonitor(l.sandboxID, l.goferFDs)
+
+    mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
+    if err := mntr.processHints(l.conf); err != nil {
         return err
     }
-
-    rootCtx := l.rootProcArgs.NewContext(l.k)
-    if err := setExecutablePath(rootCtx, &l.rootProcArgs); err != nil {
+    if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
         return err
     }
@@ -549,7 +545,7 @@ func (l *Loader) run() error {
         }
     }
     if !hasHomeEnvv {
-        homeDir, err := getExecUserHome(rootCtx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
+        homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
         if err != nil {
             return fmt.Errorf("error reading exec user: %v", err)
         }
@@ -631,7 +627,6 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
     // sentry currently supports only 1 mount namespace, which is tied to a
     // single user namespace. Thus we must run in the same user namespace
     // to access mounts.
-    // TODO(b/63601033): Create a new mount namespace for the container.
     creds := auth.NewUserCredentials(
         auth.KUID(spec.Process.User.UID),
         auth.KGID(spec.Process.User.GID),
@@ -687,13 +682,12 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
         goferFDs = append(goferFDs, fd)
     }
 
-    mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
-    if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
-        return fmt.Errorf("configuring container FS: %v", err)
-    }
+    // Setup the child container file system.
+    l.startGoferMonitor(cid, goferFDs)
 
-    if err := setExecutablePath(ctx, &procArgs); err != nil {
-        return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
+    mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
+    if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
+        return err
     }
 
     // Create and start the new process.
@@ -710,17 +704,59 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
     return nil
 }
 
+// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
+// the gofer FDs looking for disconnects, and destroys the container if a
+// disconnect occurs in any of the gofer FDs.
+func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+    go func() {
+        log.Debugf("Monitoring gofer health for container %q", cid)
+        var events []unix.PollFd
+        for _, fd := range goferFDs {
+            events = append(events, unix.PollFd{
+                Fd:     int32(fd),
+                Events: unix.POLLHUP | unix.POLLRDHUP,
+            })
+        }
+        _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
+            // Use ppoll instead of poll because it's already whitelisted in seccomp.
+            n, err := unix.Ppoll(events, nil, nil)
+            return uintptr(n), 0, err
+        })
+        if err != nil {
+            panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
+        }
+
+        // Check if the gofer has stopped as part of normal container destruction.
+        // This is done just to avoid sending an annoying error message to the log.
+        // Note that there is a small race window in between mu.Unlock() and the
+        // lock being reacquired in destroyContainer(), but it's harmless to call
+        // destroyContainer() multiple times.
+        l.mu.Lock()
+        _, ok := l.processes[execID{cid: cid}]
+        l.mu.Unlock()
+        if ok {
+            log.Infof("Gofer socket disconnected, destroying container %q", cid)
+            if err := l.destroyContainer(cid); err != nil {
+                log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+            }
+        }
+    }()
+}
+
 // destroyContainer stops a container if it is still running and cleans up its
 // filesystem.
 func (l *Loader) destroyContainer(cid string) error {
     l.mu.Lock()
     defer l.mu.Unlock()
 
-    // Has the container started?
-    _, _, err := l.threadGroupFromIDLocked(execID{cid: cid})
+    _, _, started, err := l.threadGroupFromIDLocked(execID{cid: cid})
+    if err != nil {
+        // Container doesn't exist.
+        return err
+    }
 
-    // If the container has started, kill and wait for all processes.
-    if err == nil {
+    // The container exists, has it been started?
+    if started {
         if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
             return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
         }
@@ -754,27 +790,22 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
     l.mu.Lock()
     defer l.mu.Unlock()
 
-    tg, _, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
+    tg, _, started, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
     if err != nil {
-        return 0, fmt.Errorf("no such container: %q", args.ContainerID)
+        return 0, err
+    }
+    if !started {
+        return 0, fmt.Errorf("container %q not started", args.ContainerID)
     }
 
-    // Get the container Root Dirent and MountNamespace from the Task.
+    // Get the container MountNamespace from the Task.
     tg.Leader().WithMuLocked(func(t *kernel.Task) {
-        // FSContext.RootDirectory() will take an extra ref for us.
-        args.Root = t.FSContext().RootDirectory()
-
         // task.MountNamespace() does not take a ref, so we must do so
         // ourselves.
         args.MountNamespace = t.MountNamespace()
         args.MountNamespace.IncRef()
     })
-    defer func() {
-        if args.Root != nil {
-            args.Root.DecRef()
-        }
-        args.MountNamespace.DecRef()
-    }()
+    defer args.MountNamespace.DecRef()
 
     // Start the process.
     proc := control.Proc{Kernel: l.k}
@@ -895,6 +926,8 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
             return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
         }
 
+        s.FillDefaultIPTables()
+
         return &s, nil
 
     default:
@@ -1026,20 +1059,28 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
 func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
     l.mu.Lock()
     defer l.mu.Unlock()
-    return l.threadGroupFromIDLocked(key)
+    tg, tty, ok, err := l.threadGroupFromIDLocked(key)
+    if err != nil {
+        return nil, nil, err
+    }
+    if !ok {
+        return nil, nil, fmt.Errorf("container %q not started", key.cid)
+    }
+    return tg, tty, nil
 }
 
 // threadGroupFromIDLocked returns the thread group and TTY for the given
 // execution ID. TTY may be nil if the process is not attached to a terminal.
-// Returns error if execution ID is invalid or if container/process has not
-// started yet. Caller must hold 'mu'.
-func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+// Also returns a boolean indicating whether the container has already started.
+// Returns error if execution ID is invalid or if the container cannot be
+// found (maybe it has been deleted). Caller must hold 'mu'.
+func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, bool, error) {
     ep := l.processes[key]
     if ep == nil {
-        return nil, nil, fmt.Errorf("container not found")
+        return nil, nil, false, fmt.Errorf("container %q not found", key.cid)
     }
     if ep.tg == nil {
-        return nil, nil, fmt.Errorf("container not started")
+        return nil, nil, false, nil
     }
-    return ep.tg, ep.tty, nil
+    return ep.tg, ep.tty, true, nil
 }
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index ff713660d..147ff7703 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -401,17 +401,16 @@ func TestCreateMountNamespace(t *testing.T) {
             }
             defer cleanup()
 
-            // setupRootContainer needs to find root from the context after the
-            // namespace is created.
-            var mns *fs.MountNamespace
-            setMountNS := func(m *fs.MountNamespace) {
-                mns = m
-                ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
+            mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+            mns, err := mntr.createMountNamespace(ctx, conf)
+            if err != nil {
+                t.Fatalf("failed to create mount namespace: %v", err)
             }
-            mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil, &podMountHints{})
-            if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
-                t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
+            ctx = fs.WithRoot(ctx, mns.Root())
+            if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
+                t.Fatalf("failed to create mount namespace: %v", err)
             }
+
             root := mns.Root()
             defer root.DecRef()
             for _, p := range tc.expectedPaths {
@@ -614,7 +613,7 @@ func TestRestoreEnvironment(t *testing.T) {
     for _, tc := range testCases {
         t.Run(tc.name, func(t *testing.T) {
             conf := testConfig()
-            mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil, &podMountHints{})
+            mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
             actualRenv, err := mntr.createRestoreEnvironment(conf)
             if !tc.errorExpected && err != nil {
                 t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index d3d98243d..ea0d9f790 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -38,8 +38,7 @@ type Network struct {
 
 // Route represents a route in the network stack.
 type Route struct {
-    Destination net.IP
-    Mask        net.IPMask
+    Destination net.IPNet
     Gateway     net.IP
 }
 
@@ -85,16 +84,19 @@ type CreateLinksAndRoutesArgs struct {
 
 // Empty returns true if route hasn't been set.
 func (r *Route) Empty() bool {
-    return r.Destination == nil && r.Mask == nil && r.Gateway == nil
+    return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil
 }
 
-func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
+func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) {
+    subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask))
+    if err != nil {
+        return tcpip.Route{}, err
+    }
     return tcpip.Route{
-        Destination: ipToAddress(r.Destination),
+        Destination: subnet,
         Gateway:     ipToAddress(r.Gateway),
-        Mask:        ipToAddressMask(net.IP(r.Mask)),
         NIC:         id,
-    }
+    }, nil
 }
 
 // CreateLinksAndRoutes creates links and routes in a network stack. It should
@@ -128,7 +130,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 
         // Collect the routes from this link.
         for _, r := range link.Routes {
-            routes = append(routes, r.toTcpipRoute(nicID))
+            route, err := r.toTcpipRoute(nicID)
+            if err != nil {
+                return err
+            }
+            routes = append(routes, route)
         }
     }
 
@@ -170,7 +176,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 
         // Collect the routes from this link.
         for _, r := range link.Routes {
-            routes = append(routes, r.toTcpipRoute(nicID))
+            route, err := r.toTcpipRoute(nicID)
+            if err != nil {
+                return err
+            }
+            routes = append(routes, route)
         }
     }
 
@@ -179,7 +189,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
         if !ok {
             return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
         }
-        routes = append(routes, args.DefaultGateway.Route.toTcpipRoute(nicID))
+        route, err := args.DefaultGateway.Route.toTcpipRoute(nicID)
+        if err != nil {
+            return err
+        }
+        routes = append(routes, route)
     }
 
     log.Infof("Setting routes %+v", routes)
@@ -230,8 +244,8 @@ func ipToAddress(ip net.IP) tcpip.Address {
     return addr
 }
 
-// ipToAddressMask converts IP to tcpip.AddressMask, ignoring the protocol.
-func ipToAddressMask(ip net.IP) tcpip.AddressMask {
-    _, addr := ipToAddressAndProto(ip)
-    return tcpip.AddressMask(addr)
+// ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the
+// protocol.
+func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask {
+    return tcpip.AddressMask(ipToAddress(net.IP(ipMask)))
 }
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
index 834003430..906baf3e5 100644
--- a/runsc/boot/user_test.go
+++ b/runsc/boot/user_test.go
@@ -164,13 +164,13 @@ func TestGetExecUserHome(t *testing.T) {
                 },
             }
 
-            var mns *fs.MountNamespace
-            setMountNS := func(m *fs.MountNamespace) {
-                mns = m
-                ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
+            mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{})
+            mns, err := mntr.createMountNamespace(ctx, conf)
+            if err != nil {
+                t.Fatalf("failed to create mount namespace: %v", err)
             }
-            mntr := newContainerMounter(spec, "", []int{sandEnd}, nil, &podMountHints{})
-            if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
+            ctx = fs.WithRoot(ctx, mns.Root())
+            if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
                 t.Fatalf("failed to create mount namespace: %v", err)
             }
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index e246c38ae..de8202bb1 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -49,6 +49,7 @@ go_test(
         "//pkg/abi/linux",
         "//pkg/log",
         "//pkg/sentry/control",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/unet",
         "//pkg/urpc",
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 8320bb2ca..bbb364214 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -138,6 +138,34 @@ type Container struct {
     RootContainerDir string
 }
 
+// loadSandbox loads all containers that belong to the sandbox with the given
+// ID.
+func loadSandbox(rootDir, id string) ([]*Container, error) {
+    cids, err := List(rootDir)
+    if err != nil {
+        return nil, err
+    }
+
+    // Load the container metadata.
+    var containers []*Container
+    for _, cid := range cids {
+        container, err := Load(rootDir, cid)
+        if err != nil {
+            // Container file may not exist if it raced with creation/deletion or
+            // directory was left behind. Load provides a snapshot in time, so it's
+            // fine to skip it.
+            if os.IsNotExist(err) {
+                continue
+            }
+            return nil, fmt.Errorf("loading container %q: %v", id, err)
+        }
+        if container.Sandbox.ID == id {
+            containers = append(containers, container)
+        }
+    }
+    return containers, nil
+}
+
 // Load loads a container with the given id from a metadata file. id may be an
 // abbreviation of the full container id, in which case Load loads the
 // container to which id unambiguously refers to.
@@ -180,7 +208,7 @@ func Load(rootDir, id string) (*Container, error) {
     // If the status is "Running" or "Created", check that the sandbox
     // process still exists, and set it to Stopped if it does not.
     //
-    // This is inherently racey.
+    // This is inherently racy.
     if c.Status == Running || c.Status == Created {
         // Check if the sandbox process is still running.
         if !c.isSandboxRunning() {
@@ -237,7 +265,13 @@ func List(rootDir string) ([]string, error) {
     }
     var out []string
     for _, f := range fs {
-        out = append(out, f.Name())
+        // Filter out directories that do not belong to a container.
+        cid := f.Name()
+        if validateID(cid) == nil {
+            if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil {
+                out = append(out, f.Name())
+            }
+        }
     }
     return out, nil
 }
@@ -475,7 +509,13 @@ func (c *Container) Start(conf *boot.Config) error {
     }
     c.changeStatus(Running)
-    return c.save()
+    if err := c.save(); err != nil {
+        return err
+    }
+
+    // Adjust the oom_score_adj for sandbox and gofers. This must be done after
+    // save().
+    return c.adjustOOMScoreAdj(conf)
 }
 
 // Restore takes a container and replaces its kernel and file system
@@ -1098,3 +1138,68 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
     }
     return fn()
 }
+
+// adjustOOMScoreAdj sets the oom_score_adj for the sandbox and all gofers.
+// oom_score_adj is set to the lowest oom_score_adj among the containers
+// running in the sandbox.
+//
+// TODO(gvisor.dev/issue/512): This call could race with other containers being
+// created at the same time and end up setting the wrong oom_score_adj to the
+// sandbox.
+func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
+    // If this container's OOMScoreAdj is nil then we can exit early as no
+    // change should be made to oom_score_adj for the sandbox.
+    if c.Spec.Process.OOMScoreAdj == nil {
+        return nil
+    }
+
+    containers, err := loadSandbox(conf.RootDir, c.Sandbox.ID)
+    if err != nil {
+        return fmt.Errorf("loading sandbox containers: %v", err)
+    }
+
+    // Get the lowest score for all containers.
+    var lowScore int
+    scoreFound := false
+    for _, container := range containers {
+        if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
+            scoreFound = true
+            lowScore = *container.Spec.Process.OOMScoreAdj
+        }
+    }
+
+    // Only set oom_score_adj if one of the containers has oom_score_adj set
+    // in the OCI bundle. If not, we need to inherit the parent process's
+    // oom_score_adj.
+    // See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process
+    if !scoreFound {
+        return nil
+    }
+
+    // Set the lowest of all containers oom_score_adj to the sandbox.
+    if err := setOOMScoreAdj(c.Sandbox.Pid, lowScore); err != nil {
+        return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", c.Sandbox.ID, err)
+    }
+
+    // Set container's oom_score_adj to the gofer since it is dedicated to the
+    // container, in case the gofer uses up too much memory.
+    if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
+        return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
+    }
+    return nil
+}
+
+// setOOMScoreAdj sets oom_score_adj to the given value for the given PID.
+// /proc must be available and mounted read-write. scoreAdj should be between
+// -1000 and 1000.
+func setOOMScoreAdj(pid int, scoreAdj int) error {
+    f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
+    if err != nil {
+        return err
+    }
+    defer f.Close()
+    if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
+        return err
+    }
+    return nil
+}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index ff68c586e..3d4f304f3 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -76,7 +76,7 @@ func waitForProcessCount(cont *Container, want int) error {
 }
 
 func blockUntilWaitable(pid int) error {
-    _, _, err := testutil.RetryEintr(func() (uintptr, uintptr, error) {
+    _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
         var err error
         _, _, err1 := syscall.Syscall6(syscall.SYS_WAITID, 1, uintptr(pid), 0, syscall.WEXITED|syscall.WNOWAIT, 0, 0)
         if err1 != 0 {
@@ -1310,10 +1310,13 @@ func TestRunNonRoot(t *testing.T) {
         t.Logf("Running test with conf: %+v", conf)
 
         spec := testutil.NewSpecWithArgs("/bin/true")
+
+        // Set a random user/group with no access to "blocked" dir.
         spec.Process.User.UID = 343
         spec.Process.User.GID = 2401
+        spec.Process.Capabilities = nil
 
-        // User that container runs as can't list '$TMP/blocked' and would fail to
+        // User running inside container can't list '$TMP/blocked' and would fail to
         // mount it.
         dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
         if err != nil {
@@ -1327,6 +1330,17 @@ func TestRunNonRoot(t *testing.T) {
             t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
         }
 
+        src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+        if err != nil {
+            t.Fatalf("ioutil.TempDir() failed: %v", err)
+        }
+
+        spec.Mounts = append(spec.Mounts, specs.Mount{
+            Destination: dir,
+            Source:      src,
+            Type:        "bind",
+        })
+
         if err := run(spec, conf); err != nil {
             t.Fatalf("error running sandbox: %v", err)
         }
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 978a422f5..ae03d24b4 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -29,6 +29,7 @@ import (
 
     specs "github.com/opencontainers/runtime-spec/specs-go"
     "gvisor.dev/gvisor/pkg/sentry/control"
+    "gvisor.dev/gvisor/pkg/sentry/kernel"
     "gvisor.dev/gvisor/runsc/boot"
     "gvisor.dev/gvisor/runsc/specutils"
     "gvisor.dev/gvisor/runsc/test/testutil"
@@ -59,11 +60,14 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 }
 
 func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
-    rootDir, err := testutil.SetupRootDir()
-    if err != nil {
-        return nil, nil, fmt.Errorf("error creating root dir: %v", err)
+    // Setup root dir if one hasn't been provided.
+    if len(conf.RootDir) == 0 {
+        rootDir, err := testutil.SetupRootDir()
+        if err != nil {
+            return nil, nil, fmt.Errorf("error creating root dir: %v", err)
+        }
+        conf.RootDir = rootDir
     }
-    conf.RootDir = rootDir
 
     var containers []*Container
     var bundles []string
@@ -74,7 +78,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
         for _, b := range bundles {
             os.RemoveAll(b)
         }
-        os.RemoveAll(rootDir)
+        os.RemoveAll(conf.RootDir)
     }
     for i, spec := range specs {
         bundleDir, err := testutil.SetupBundleDir(spec)
@@ -488,7 +492,7 @@ func TestMultiContainerSignal(t *testing.T) {
     if err := containers[1].Destroy(); err != nil {
         t.Errorf("failed to destroy container: %v", err)
     }
-    _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) {
+    _, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) {
         cpid, err := syscall.Wait4(goferPid, nil, 0, nil)
         return uintptr(cpid), 0, err
     })
@@ -905,9 +909,9 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
     }
 }
 
-// TestMultiContainerGoferStop tests that IO operations continue to work after
-// containers have been stopped and gofers killed.
-func TestMultiContainerGoferStop(t *testing.T) {
+// TestMultiContainerContainerDestroyStress tests that IO operations continue
+// to work after containers have been stopped and gofers killed.
+func TestMultiContainerContainerDestroyStress(t *testing.T) {
     app, err := testutil.FindFile("runsc/container/test_app/test_app")
     if err != nil {
         t.Fatal("error finding test_app:", err)
@@ -1345,3 +1349,194 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
         }
     }
 }
+
+// Test that container is destroyed when Gofer is killed.
+func TestMultiContainerGoferKilled(t *testing.T) {
+    sleep := []string{"sleep", "100"}
+    specs, ids := createSpecs(sleep, sleep, sleep)
+    conf := testutil.TestConfig()
+    containers, cleanup, err := startContainers(conf, specs, ids)
+    if err != nil {
+        t.Fatalf("error starting containers: %v", err)
+    }
+    defer cleanup()
+
+    // Ensure container is running.
+    c := containers[2]
+    expectedPL := []*control.Process{
+        {PID: 3, Cmd: "sleep"},
+    }
+    if err := waitForProcessList(c, expectedPL); err != nil {
+        t.Errorf("failed to wait for sleep to start: %v", err)
+    }
+
+    // Kill container's gofer.
+    if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+        t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err)
+    }
+
+    // Wait until container stops.
+    if err := waitForProcessList(c, nil); err != nil {
+        t.Errorf("Container %q was not stopped after gofer death: %v", c.ID, err)
+    }
+
+    // Check that container isn't running anymore.
+    args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+    if _, err := c.executeSync(args); err == nil {
+        t.Fatalf("Container %q was not stopped after gofer death", c.ID)
+    }
+
+    // Check that other containers are unaffected.
+    for i, c := range containers {
+        if i == 2 {
+            continue // container[2] has been killed.
+        }
+        pl := []*control.Process{
+            {PID: kernel.ThreadID(i + 1), Cmd: "sleep"},
+        }
+        if err := waitForProcessList(c, pl); err != nil {
+            t.Errorf("Container %q was affected by another container: %v", c.ID, err)
+        }
+        args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+        if _, err := c.executeSync(args); err != nil {
+            t.Fatalf("Container %q was affected by another container: %v", c.ID, err)
+        }
+    }
+
+    // Kill root container's gofer to bring entire sandbox down.
+    c = containers[0]
+    if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+        t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err)
+    }
+
+    // Wait until sandbox stops. waitForProcessList will loop until sandbox exits
+    // and RPC errors out.
+    impossiblePL := []*control.Process{
+        {PID: 100, Cmd: "non-existent-process"},
+    }
+    if err := waitForProcessList(c, impossiblePL); err == nil {
+        t.Fatalf("Sandbox was not killed after gofer death")
+    }
+
+    // Check that entire sandbox isn't running anymore.
+    for _, c := range containers {
+        args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+        if _, err := c.executeSync(args); err == nil {
+            t.Fatalf("Container %q was not stopped after gofer death", c.ID)
+        }
+    }
+}
+
+func TestMultiContainerLoadSandbox(t *testing.T) {
+    sleep := []string{"sleep", "100"}
+    specs, ids := createSpecs(sleep, sleep, sleep)
+    conf := testutil.TestConfig()
+
+    // Create containers for the sandbox.
+    wants, cleanup, err := startContainers(conf, specs, ids)
+    if err != nil {
+        t.Fatalf("error starting containers: %v", err)
+    }
+    defer cleanup()
+
+    // Then create unrelated containers.
+    for i := 0; i < 3; i++ {
+        specs, ids = createSpecs(sleep, sleep, sleep)
+        _, cleanup, err = startContainers(conf, specs, ids)
+        if err != nil {
+            t.Fatalf("error starting containers: %v", err)
+        }
+        defer cleanup()
+    }
+
+    // Create an unrelated directory under root.
+    dir := filepath.Join(conf.RootDir, "not-a-container")
+    if err := os.MkdirAll(dir, 0755); err != nil {
+        t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+    }
+
+    // Create a valid but empty container directory.
+    randomCID := testutil.UniqueContainerID()
+    dir = filepath.Join(conf.RootDir, randomCID)
+    if err := os.MkdirAll(dir, 0755); err != nil {
+        t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+    }
+
+    // Load the sandbox and check that the correct containers were returned.
+    id := wants[0].Sandbox.ID
+    gots, err := loadSandbox(conf.RootDir, id)
+    if err != nil {
+        t.Fatalf("loadSandbox()=%v", err)
+    }
+    wantIDs := make(map[string]struct{})
+    for _, want := range wants {
+        wantIDs[want.ID] = struct{}{}
+    }
+    for _, got := range gots {
+        if got.Sandbox.ID != id {
+            t.Errorf("wrong sandbox ID, got: %v, want: %v", got.Sandbox.ID, id)
+        }
+        if _, ok := wantIDs[got.ID]; !ok {
+            t.Errorf("wrong container ID, got: %v, wants: %v", got.ID, wantIDs)
+        }
+        delete(wantIDs, got.ID)
+    }
+    if len(wantIDs) != 0 {
+        t.Errorf("containers not found: %v", wantIDs)
+    }
+}
+
+// TestMultiContainerRunNonRoot checks that a child container can be configured
+// when running as a non-privileged user.
+func TestMultiContainerRunNonRoot(t *testing.T) {
+    cmdRoot := []string{"/bin/sleep", "100"}
+    cmdSub := []string{"/bin/true"}
+    podSpecs, ids := createSpecs(cmdRoot, cmdSub)
+
+    // User running inside container can't list '$TMP/blocked' and would fail to
+    // mount it.
+    blocked, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
+    if err != nil {
+        t.Fatalf("ioutil.TempDir() failed: %v", err)
+    }
+    if err := os.Chmod(blocked, 0700); err != nil {
+        t.Fatalf("os.Chmod(%q) failed: %v", blocked, err)
+    }
+    dir := path.Join(blocked, "test")
+    if err := os.Mkdir(dir, 0755); err != nil {
+        t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
+    }
+
+    src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+    if err != nil {
+        t.Fatalf("ioutil.TempDir() failed: %v", err)
+    }
+
+    // Set a random user/group with no access to "blocked" dir.
+    podSpecs[1].Process.User.UID = 343
+    podSpecs[1].Process.User.GID = 2401
+    podSpecs[1].Process.Capabilities = nil
+
+    podSpecs[1].Mounts = append(podSpecs[1].Mounts, specs.Mount{
+        Destination: dir,
+        Source:      src,
+        Type:        "bind",
+    })
+
+    conf := testutil.TestConfig()
+    pod, cleanup, err := startContainers(conf, podSpecs, ids)
+    if err != nil {
+        t.Fatalf("error starting containers: %v", err)
+    }
+    defer cleanup()
+
+    // Once all containers are started, wait for the child container to exit.
+    // This means that the volume was mounted properly.
+    ws, err := pod[1].Wait()
+    if err != nil {
+        t.Fatalf("running child container: %v", err)
+    }
+    if !ws.Exited() || ws.ExitStatus() != 0 {
+        t.Fatalf("child container failed, waitStatus: %v", ws)
+    }
+}
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 2d50774d4..8ddfa77d6 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -138,7 +138,7 @@ var allowedSyscalls = seccomp.SyscallRules{
     syscall.SYS_NANOSLEEP:  {},
    syscall.SYS_NEWFSTATAT: {},
     syscall.SYS_OPENAT:     {},
-    syscall.SYS_POLL:       {},
+    syscall.SYS_PPOLL:      {},
     syscall.SYS_PREAD64:    {},
     syscall.SYS_PWRITE64:   {},
     syscall.SYS_READ:       {},
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index fe450c64f..7c4d2b94e 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -125,7 +125,7 @@ func (a *attachPoint) Attach() (p9.File, error) {
         return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err)
     }
     mode := syscall.O_RDWR
-    if a.conf.ROMount || stat.Mode&syscall.S_IFDIR != 0 {
+    if a.conf.ROMount || (stat.Mode&syscall.S_IFMT) == syscall.S_IFDIR {
         mode = syscall.O_RDONLY
     }
 
@@ -141,9 +141,13 @@ func (a *attachPoint) Attach() (p9.File, error) {
         f.Close()
         return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
     }
 
-    a.attached = true
-    return newLocalFile(a, f, a.prefix, stat)
+    rv, err := newLocalFile(a, f, a.prefix, stat)
+    if err != nil {
+        return nil, err
+    }
+    a.attached = true
+    return rv, nil
 }
 
 // makeQID returns a unique QID for the given stat buffer.
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index 0a162bb8a..cbbe71019 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -17,8 +17,10 @@ package fsgofer
 import (
     "fmt"
     "io/ioutil"
+    "net"
     "os"
     "path"
+    "path/filepath"
     "syscall"
     "testing"
@@ -621,6 +623,54 @@ func TestAttachFile(t *testing.T) {
     }
 }
 
+func TestAttachInvalidType(t *testing.T) {
+    dir, err := ioutil.TempDir("", "attach-")
+    if err != nil {
+        t.Fatalf("ioutil.TempDir() failed, err: %v", err)
+    }
+    defer os.RemoveAll(dir)
+
+    fifo := filepath.Join(dir, "fifo")
+    if err := syscall.Mkfifo(fifo, 0755); err != nil {
+        t.Fatalf("Mkfifo(%q): %v", fifo, err)
+    }
+
+    dirFile, err := os.Open(dir)
+    if err != nil {
+        t.Fatalf("Open(%s): %v", dir, err)
+    }
+    defer dirFile.Close()
+
+    // Bind a socket via /proc to be sure that the length of the socket path
+    // is less than UNIX_PATH_MAX.
+    socket := filepath.Join(fmt.Sprintf("/proc/self/fd/%d", dirFile.Fd()), "socket")
+    l, err := net.Listen("unix", socket)
+    if err != nil {
+        t.Fatalf("net.Listen(unix, %q): %v", socket, err)
+    }
+    defer l.Close()
+
+    for _, tc := range []struct {
+        name string
+        path string
+    }{
+        {name: "fifo", path: fifo},
+        {name: "socket", path: socket},
+    } {
+        t.Run(tc.name, func(t *testing.T) {
+            conf := Config{ROMount: false}
+            a, err := NewAttachPoint(tc.path, conf)
+            if err != nil {
+                t.Fatalf("NewAttachPoint failed: %v", err)
+            }
+            f, err := a.Attach()
+            if f != nil || err == nil {
+                t.Fatalf("Attach should have failed, got (%v, nil)", f)
+            }
+        })
+    }
+}
+
 func TestDoubleAttachError(t *testing.T) {
     conf := Config{ROMount: false}
     root, err := ioutil.TempDir("", "root-")
diff --git a/runsc/main.go b/runsc/main.go
index 58e7dd8f3..70f06dbb8 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -22,6 +22,7 @@ import (
     "io"
     "io/ioutil"
     "os"
+    "os/signal"
     "path/filepath"
     "strings"
     "syscall"
@@ -30,6 +31,7 @@ import (
 
     "github.com/google/subcommands"
     "gvisor.dev/gvisor/pkg/log"
+    "gvisor.dev/gvisor/pkg/refs"
     "gvisor.dev/gvisor/pkg/sentry/platform"
     "gvisor.dev/gvisor/runsc/boot"
     "gvisor.dev/gvisor/runsc/cmd"
@@ -174,6 +176,10 @@ func main() {
         cmd.Fatalf("%v", err)
     }
 
+    // Sets the reference leak check mode. Also set it in config below to
+    // propagate it to child processes.
+    refs.SetLeakMode(refsLeakMode)
+
     // Create a new Config from the flags.
     conf := &boot.Config{
         RootDir: *rootDir,
@@ -264,6 +270,13 @@ func main() {
     log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
     log.Infof("***************************")
 
+    if *testOnlyAllowRunAsCurrentUserWithoutChroot {
+        // SIGTERM is sent to all processes if a test exceeds its
+        // timeout and this case is handled by syscall_test_runner.
+        log.Warningf("Block the TERM signal. This is only safe in tests!")
+        signal.Ignore(syscall.SIGTERM)
+    }
+
     // Call the subcommand and pass in the configuration.
     var ws syscall.WaitStatus
     subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index a965a9dcb..5634f0707 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -33,13 +33,6 @@ import (
     "gvisor.dev/gvisor/runsc/specutils"
 )
 
-const (
-    // Annotations used to indicate whether the container corresponds to a
-    // pod or a container within a pod.
-    crioContainerTypeAnnotation       = "io.kubernetes.cri-o.ContainerType"
-    containerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
-)
-
 // setupNetwork configures the network stack to mimic the local network
 // configuration. Docker uses network namespaces with vnets to configure the
 // network for the container. The untrusted app expects to see the same network
@@ -88,12 +81,17 @@ func createDefaultLoopbackInterface(conn *urpc.Client) error {
         },
         Routes: []boot.Route{
             {
-                Destination: net.IP("\x7f\x00\x00\x00"),
-                Mask:        net.IPMask("\xff\x00\x00\x00"),
+                Destination: net.IPNet{
+                    IP:   net.IPv4(0x7f, 0, 0, 0),
+                    Mask: net.IPv4Mask(0xff, 0, 0, 0),
+                },
             },
             {
-                Destination: net.IPv6loopback,
-                Mask:        net.IPMask(strings.Repeat("\xff", 16)),
+                Destination: net.IPNet{
+                    IP:   net.IPv6loopback,
+                    Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+                },
             },
         },
     }
@@ -333,12 +331,13 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink,
         if !ok {
             return nil, fmt.Errorf("address is not IPNet: %+v", addr)
         }
+
+        dst := *ipNet
+        dst.IP = dst.IP.Mask(dst.Mask)
         links = append(links, boot.LoopbackLink{
             Name:      iface.Name,
             Addresses: []net.IP{ipNet.IP},
             Routes: []boot.Route{{
-                Destination: ipNet.IP.Mask(ipNet.Mask),
-                Mask:        ipNet.Mask,
+                Destination: dst,
             }},
         })
     }
@@ -374,9 +373,11 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
             }
             // Create a catch all route to the gateway.
             def = &boot.Route{
-                Destination: net.IPv4zero,
-                Mask:        net.IPMask(net.IPv4zero),
-                Gateway:     r.Gw,
+                Destination: net.IPNet{
+                    IP:   net.IPv4zero,
+                    Mask: net.IPMask(net.IPv4zero),
+                },
+                Gateway: r.Gw,
             }
             continue
         }
@@ -384,9 +385,10 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
             log.Warningf("IPv6 is not supported, skipping route: %v", r)
             continue
         }
+
+        dst := *r.Dst
+        dst.IP = dst.IP.Mask(dst.Mask)
         routes = append(routes, boot.Route{
-            Destination: r.Dst.IP.Mask(r.Dst.Mask),
-            Mask:        r.Dst.Mask,
+            Destination: dst,
             Gateway:     r.Gw,
         })
     }
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 4a11f617d..df3c0c5ef 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -54,7 +54,7 @@ type Sandbox struct {
     // ID as the first container run in the sandbox.
     ID string `json:"id"`
 
-    // Pid is the pid of the running sandbox (immutable). May be 0 is the sandbox
+    // Pid is the pid of the running sandbox (immutable). May be 0 if the sandbox
     // is not running.
     Pid int `json:"pid"`
 
@@ -361,6 +361,8 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
         nextFD++
     }
 
+    cmd.Args = append(cmd.Args, "--panic-signal="+strconv.Itoa(int(syscall.SIGTERM)))
+
     // Add the "boot" command to the args.
     //
     // All flags after this must be for the boot command
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 0b40e38a3..2eec92349 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -492,3 +492,14 @@ func (c *Cleanup) Clean() {
 func (c *Cleanup) Release() {
     c.clean = nil
 }
+
+// RetryEintr retries the function until an error different than EINTR is
+// returned.
+func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
+    for {
+        r1, r2, err := f()
+        if err != syscall.EINTR {
+            return r1, r2, err
+        }
+    }
+}
diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go
index 3f3e191b0..94e625259 100644
--- a/runsc/test/testutil/docker.go
+++ b/runsc/test/testutil/docker.go
@@ -297,7 +297,11 @@ func (d *Docker) Remove() error {
 func (d *Docker) CleanUp() {
     d.logDockerID()
     if _, err := do("kill", d.Name); err != nil {
-        log.Printf("error killing container %q: %v", d.Name, err)
+        if strings.Contains(err.Error(), "is not running") {
+            // Nothing to kill. Don't log the error in this case.
+        } else {
+            log.Printf("error killing container %q: %v", d.Name, err)
+        }
     }
     if err := d.Remove(); err != nil {
         log.Print(err)
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index a98675bfc..4a3dfa0e3 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -127,13 +127,15 @@ func FindFile(path string) (string, error) {
 // 'RootDir' must be set by caller if required.
 func TestConfig() *boot.Config {
     return &boot.Config{
-        Debug:      true,
-        LogFormat:  "text",
-        LogPackets: true,
-        Network:    boot.NetworkNone,
-        Strace:     true,
-        Platform:   "ptrace",
-        FileAccess: boot.FileAccessExclusive,
+        Debug:           true,
+        LogFormat:       "text",
+        DebugLogFormat:  "text",
+        AlsoLogToStderr: true,
+        LogPackets:      true,
+        Network:         boot.NetworkNone,
+        Strace:          true,
+        Platform:        "ptrace",
+        FileAccess:      boot.FileAccessExclusive,
         TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
         NumNetworkChannels:                         1,
     }
@@ -189,11 +191,14 @@ func SetupRootDir() (string, error) {
 // SetupContainer creates a bundle and root dir for the container, generates a
 // test config, and writes the spec to config.json in the bundle dir.
 func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) {
-    rootDir, err = SetupRootDir()
-    if err != nil {
-        return "", "", err
+    // Setup root dir if one hasn't been provided.
+    if len(conf.RootDir) == 0 {
+        rootDir, err = SetupRootDir()
+        if err != nil {
+            return "", "", err
+        }
+        conf.RootDir = rootDir
     }
-    conf.RootDir = rootDir
     bundleDir, err = SetupBundleDir(spec)
     return rootDir, bundleDir, err
 }
@@ -348,17 +353,6 @@ func StartReaper() func() {
     return r.Stop
 }
 
-// RetryEintr retries the function until an error different than EINTR is
-// returned.
-func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
-    for {
-        r1, r2, err := f()
-        if err != syscall.EINTR {
-            return r1, r2, err
-        }
-    }
-}
-
 // WaitUntilRead reads from the given reader until the wanted string is found
 // or until timeout.
 func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error {