Diffstat (limited to 'runsc')
 runsc/BUILD                             |   2
 runsc/boot/BUILD                        |   1
 runsc/boot/config.go                    |  18
 runsc/boot/controller.go                |  12
 runsc/boot/filter/config.go             |   2
 runsc/boot/fs.go                        | 182
 runsc/boot/loader.go                    | 125
 runsc/boot/loader_test.go               |  19
 runsc/boot/network.go                   |  42
 runsc/boot/user_test.go                 |  12
 runsc/container/BUILD                   |   1
 runsc/container/container.go            | 111
 runsc/container/container_test.go       |  18
 runsc/container/multi_container_test.go | 213
 runsc/fsgofer/filter/config.go          |   2
 runsc/fsgofer/fsgofer.go                |  10
 runsc/fsgofer/fsgofer_test.go           |  50
 runsc/main.go                           |  13
 runsc/sandbox/network.go                |  38
 runsc/sandbox/sandbox.go                |   4
 runsc/specutils/specutils.go            |  11
 runsc/test/testutil/docker.go           |   6
 runsc/test/testutil/testutil.go         |  38
 23 files changed, 688 insertions(+), 242 deletions(-)
diff --git a/runsc/BUILD b/runsc/BUILD
index 6b8c92706..cc8852d7d 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -16,6 +16,7 @@ go_binary(
x_defs = {"main.version": "{VERSION}"},
deps = [
"//pkg/log",
+ "//pkg/refs",
"//pkg/sentry/platform",
"//runsc/boot",
"//runsc/cmd",
@@ -48,6 +49,7 @@ go_binary(
x_defs = {"main.version": "{VERSION}"},
deps = [
"//pkg/log",
+ "//pkg/refs",
"//pkg/sentry/platform",
"//runsc/boot",
"//runsc/cmd",
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 5025401dd..588bb8851 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -88,6 +88,7 @@ go_library(
"//runsc/specutils",
"@com_github_golang_protobuf//proto:go_default_library",
"@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 6a742f349..05b8f8761 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -113,7 +113,7 @@ func MakeWatchdogAction(s string) (watchdog.Action, error) {
}
}
-// MakeRefsLeakMode converts type from string
+// MakeRefsLeakMode converts type from string.
func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
switch strings.ToLower(s) {
case "disabled":
@@ -127,6 +127,20 @@ func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
}
}
+func refsLeakModeToString(mode refs.LeakMode) string {
+ switch mode {
+ // If not set, default it to disabled.
+ case refs.UninitializedLeakChecking, refs.NoLeakChecking:
+ return "disabled"
+ case refs.LeaksLogWarning:
+ return "log-names"
+ case refs.LeaksLogTraces:
+ return "log-traces"
+ default:
+ panic(fmt.Sprintf("Invalid leakmode: %d", mode))
+ }
+}
+
// Config holds configuration that is not part of the runtime spec.
type Config struct {
// RootDir is the runtime root directory.
@@ -245,7 +259,7 @@ func (c *Config) ToFlags() []string {
"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
"--rootless=" + strconv.FormatBool(c.Rootless),
"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
- "--ref-leak-mode=" + c.ReferenceLeakMode.String(),
+ "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
}
if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
// Only include if set since it is never to be used by users.
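
The new refsLeakModeToString helper is the inverse of MakeRefsLeakMode, so the leak mode survives the round trip through ToFlags when the config is propagated to child processes. A minimal round-trip sketch, written as if inside package boot (refsLeakModeToString is unexported); the names match the functions above:

    for _, s := range []string{"disabled", "log-names", "log-traces"} {
        mode, err := MakeRefsLeakMode(s) // parse the flag value
        if err != nil {
            panic(err)
        }
        // Serializing the mode back must yield the original flag value.
        if got := refsLeakModeToString(mode); got != s {
            panic(fmt.Sprintf("round trip broke: %q -> %v -> %q", s, mode, got))
        }
    }
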
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index d79aaff60..72cbabd16 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -328,10 +328,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("at most two files may be passed to Restore")
}
- networkStack := cm.l.k.NetworkStack()
- // Destroy the old kernel and create a new kernel.
+ // Pause the kernel while we build a new one.
cm.l.k.Pause()
- cm.l.k.Destroy()
p, err := createPlatform(cm.l.conf, deviceFile)
if err != nil {
@@ -345,10 +343,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("creating memory file: %v", err)
}
k.SetMemoryFile(mf)
+ networkStack := cm.l.k.NetworkStack()
cm.l.k = k
// Set up the restore environment.
- mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints)
+ mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
renv, err := mntr.createRestoreEnvironment(cm.l.conf)
if err != nil {
return fmt.Errorf("creating RestoreEnvironment: %v", err)
@@ -380,13 +379,10 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Load the state.
loadOpts := state.LoadOpts{Source: specFile}
- if err := loadOpts.Load(k, networkStack); err != nil {
+ if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil {
return err
}
- // Set timekeeper.
- k.Timekeeper().SetClocks(time.NewCalibratedClocks())
-
// Since we have a new kernel we also must make a new watchdog.
dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 0ee5b8bbd..7ca776b3a 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -207,7 +207,7 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_MPROTECT: {},
syscall.SYS_MUNMAP: {},
syscall.SYS_NANOSLEEP: {},
- syscall.SYS_POLL: {},
+ syscall.SYS_PPOLL: {},
syscall.SYS_PREAD64: {},
syscall.SYS_PWRITE64: {},
syscall.SYS_READ: {},
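
Context for the poll-to-ppoll swap: ppoll(2) with a nil timeout blocks exactly like poll(2) with a -1 timeout, so the sandbox code (see the gofer monitor in loader.go below) can issue ppoll and SYS_POLL can be dropped from the filter. A small self-contained sketch of the equivalent call using golang.org/x/sys/unix:

    package main

    import (
        "fmt"

        "golang.org/x/sys/unix"
    )

    func main() {
        // Wait for stdin to become readable (press Enter to unblock).
        fds := []unix.PollFd{{Fd: 0, Events: unix.POLLIN}}
        // A nil timeout makes ppoll(2) block indefinitely, just like
        // poll(2) with timeout -1, so only SYS_PPOLL must be allowed.
        n, err := unix.Ppoll(fds, nil, nil)
        fmt.Println(n, err)
    }
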
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 7e95e1f41..34c674840 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -25,10 +25,8 @@ import (
// Include filesystem types that OCI spec might mount.
_ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
- "gvisor.dev/gvisor/pkg/sentry/fs/gofer"
_ "gvisor.dev/gvisor/pkg/sentry/fs/host"
_ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
- "gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
@@ -38,6 +36,8 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/gofer"
+ "gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
@@ -263,6 +263,18 @@ func subtargets(root string, mnts []specs.Mount) []string {
return targets
}
+func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+ mns, err := mntr.setupFS(conf, procArgs)
+ if err != nil {
+ return err
+ }
+
+ // Set namespace here so that it can be found in ctx.
+ procArgs.MountNamespace = mns
+
+ return setExecutablePath(ctx, procArgs)
+}
+
// setExecutablePath sets the procArgs.Filename by searching the PATH for an
// executable matching the procArgs.Argv[0].
func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
@@ -478,9 +490,6 @@ func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
}
type containerMounter struct {
- // cid is the container ID. May be set to empty for the root container.
- cid string
-
root *specs.Root
// mounts is the set of submounts for the container. It's a copy from the spec
@@ -495,9 +504,8 @@ type containerMounter struct {
hints *podMountHints
}
-func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
return &containerMounter{
- cid: cid,
root: spec.Root,
mounts: compileMounts(spec),
fds: fdDispenser{fds: goferFDs},
@@ -506,63 +514,88 @@ func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel
}
}
-// setupFS is used to set up the file system for containers and amend
-// the procArgs accordingly. This is the main entry point for this rest of
-// functions in this file. procArgs are passed by reference and the FDMap field
-// is modified. It dups stdioFDs.
-func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
- // Use root user to configure mounts. The current user might not have
- // permission to do so.
- rootProcArgs := kernel.CreateProcessArgs{
- WorkingDirectory: "/",
- Credentials: auth.NewRootCredentials(creds.UserNamespace),
- Umask: 0022,
- MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
- PIDNamespace: procArgs.PIDNamespace,
+// processHints processes annotations that contain hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHints(conf *Config) error {
+ ctx := c.k.SupervisorContext()
+ for _, hint := range c.hints.mounts {
+ log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+ inode, err := c.mountSharedMaster(ctx, conf, hint)
+ if err != nil {
+ return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+ }
+ hint.root = inode
}
+ return nil
+}
+
+// setupFS is used to set up the file system for all containers. This is the
+// main entry point method, with most of the others being internal only. It
+// returns the mount namespace that is created for the container.
+func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+ log.Infof("Configuring container's file system")
+
+ // Create context with root credentials to mount the filesystem (the current
+ // user may not be privileged enough).
+ rootProcArgs := *procArgs
+ rootProcArgs.WorkingDirectory = "/"
+ rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+ rootProcArgs.Umask = 0022
+ rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
rootCtx := rootProcArgs.NewContext(c.k)
- // If this is the root container, we also need to setup the root mount
- // namespace.
- rootMNS := c.k.RootMountNamespace()
- if rootMNS == nil {
- // Setup the root container.
- if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) {
- // The callback to setupRootContainer inherits a
- // reference on the rootMNS, so we don't need to take
- // an additional reference here.
- procArgs.MountNamespace = rootMNS
- procArgs.Root = rootMNS.Root()
- c.k.SetRootMountNamespace(rootMNS)
- }); err != nil {
- return err
- }
- return c.checkDispenser()
+ mns, err := c.createMountNamespace(rootCtx, conf)
+ if err != nil {
+ return nil, err
}
- // Setup a child container.
- log.Infof("Creating new process in child container.")
+ // Set namespace here so that it can be found in rootCtx.
+ rootProcArgs.MountNamespace = mns
- // Create a new root inode and mount namespace for the container.
- rootInode, err := c.createRootMount(rootCtx, conf)
+ if err := c.mountSubmounts(rootCtx, conf, mns); err != nil {
+ return nil, err
+ }
+ return mns, nil
+}
+
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+ rootInode, err := c.createRootMount(ctx, conf)
if err != nil {
- return fmt.Errorf("creating filesystem for container: %v", err)
+ return nil, fmt.Errorf("creating filesystem for container: %v", err)
}
- mns, err := fs.NewMountNamespace(rootCtx, rootInode)
+ mns, err := fs.NewMountNamespace(ctx, rootInode)
if err != nil {
- return fmt.Errorf("creating new mount namespace for container: %v", err)
+ return nil, fmt.Errorf("creating new mount namespace for container: %v", err)
}
+ return mns, nil
+}
- // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
- // This will also donate a reference to procArgs, as required.
- procArgs.MountNamespace = mns
- procArgs.Root = mns.Root()
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+ root := mns.Root()
+ defer root.DecRef()
- // Mount all submounts.
- if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil {
+ for _, m := range c.mounts {
+ log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
+ if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+ if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+ return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+ }
+ } else {
+ if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+ return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+ }
+ }
+ }
+
+ if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+ return fmt.Errorf("mount submount %q: %v", "tmp", err)
+ }
+
+ if err := c.checkDispenser(); err != nil {
return err
}
- return c.checkDispenser()
+ return nil
}
func (c *containerMounter) checkDispenser() error {
@@ -572,36 +605,6 @@ func (c *containerMounter) checkDispenser() error {
return nil
}
-// setupRootContainer creates a mount namespace containing the root filesystem
-// and all mounts. 'rootCtx' is used to walk directories to find mount points.
-// The 'setMountNS' callback is called after the mount namespace is created and
-// will get a reference on that namespace. The callback must ensure that the
-// rootCtx has the provided mount namespace.
-func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
- for _, hint := range c.hints.mounts {
- log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
- inode, err := c.mountSharedMaster(rootCtx, conf, hint)
- if err != nil {
- return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
- }
- hint.root = inode
- }
-
- rootInode, err := c.createRootMount(rootCtx, conf)
- if err != nil {
- return fmt.Errorf("creating root mount: %v", err)
- }
- mns, err := fs.NewMountNamespace(userCtx, rootInode)
- if err != nil {
- return fmt.Errorf("creating root mount namespace: %v", err)
- }
- setMountNS(mns)
-
- root := mns.Root()
- defer root.DecRef()
- return c.mountSubmounts(rootCtx, conf, mns, root)
-}
-
// mountSharedMaster mounts the master of a volume that is shared among
// containers in a pod. It returns the root mount's inode.
func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
@@ -717,25 +720,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
return fsName, opts, useOverlay, err
}
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
- for _, m := range c.mounts {
- if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
- if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
- return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
- }
- } else {
- if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
- return fmt.Errorf("mount submount %q: %v", m.Destination, err)
- }
- }
- }
-
- if err := c.mountTmp(ctx, conf, mns, root); err != nil {
- return fmt.Errorf("mount submount %q: %v", "tmp", err)
- }
- return nil
-}
-
// mountSubmount mounts volumes inside the container's root. Because mounts may
// be readonly, a lower ramfs overlay is added to create the mount point dir.
// Another overlay is added with tmpfs on top if Config.Overlay is true.
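
After this refactor the mounting flow is uniform across containers: processHints runs once for the root container to mount the masters of shared volumes, and setupContainerFS then drives createMountNamespace, mountSubmounts, and setExecutablePath for every container. A schematic sketch of the call order using only functions from this diff (surrounding declarations and error values elided, so this is not runnable on its own):

    mntr := newContainerMounter(spec, goferFDs, k, mountHints)

    // Root container only: mount the masters of shared (pod) volumes.
    if err := mntr.processHints(conf); err != nil {
        return err
    }

    // Every container: create the MountNamespace, mount submounts, set
    // procArgs.MountNamespace, and resolve procArgs.Filename.
    if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
        return err
    }
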
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 65ac67dbf..19b738705 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -27,12 +27,12 @@ import (
gtime "time"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/memutil"
"gvisor.dev/gvisor/pkg/rand"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -181,9 +181,6 @@ type Args struct {
// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
func New(args Args) (*Loader, error) {
- // Sets the reference leak check mode
- refs.SetLeakMode(args.Conf.ReferenceLeakMode)
-
// We initialize the rand package now to make sure /dev/urandom is pre-opened
// on kernels that do not support getrandom(2).
if err := rand.Init(); err != nil {
@@ -527,15 +524,14 @@ func (l *Loader) run() error {
// ours either way.
l.rootProcArgs.FDTable = fdTable
- // cid for root container can be empty. Only subcontainers need it to set
- // the mount location.
- mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
- if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
+ // Setup the root container file system.
+ l.startGoferMonitor(l.sandboxID, l.goferFDs)
+
+ mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
+ if err := mntr.processHints(l.conf); err != nil {
return err
}
-
- rootCtx := l.rootProcArgs.NewContext(l.k)
- if err := setExecutablePath(rootCtx, &l.rootProcArgs); err != nil {
+ if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
return err
}
@@ -549,7 +545,7 @@ func (l *Loader) run() error {
}
}
if !hasHomeEnvv {
- homeDir, err := getExecUserHome(rootCtx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
+ homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
if err != nil {
return fmt.Errorf("error reading exec user: %v", err)
}
@@ -631,7 +627,6 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
// sentry currently supports only 1 mount namespace, which is tied to a
// single user namespace. Thus we must run in the same user namespace
// to access mounts.
- // TODO(b/63601033): Create a new mount namespace for the container.
creds := auth.NewUserCredentials(
auth.KUID(spec.Process.User.UID),
auth.KGID(spec.Process.User.GID),
@@ -687,13 +682,12 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
goferFDs = append(goferFDs, fd)
}
- mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
- if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
- return fmt.Errorf("configuring container FS: %v", err)
- }
+ // Setup the child container file system.
+ l.startGoferMonitor(cid, goferFDs)
- if err := setExecutablePath(ctx, &procArgs); err != nil {
- return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
+ mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
+ if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
+ return err
}
// Create and start the new process.
@@ -710,17 +704,59 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
return nil
}
+// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
+// the gofer FDs looking for disconnects, and destroys the container if a
+// disconnect occurs in any of the gofer FDs.
+func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+ go func() {
+ log.Debugf("Monitoring gofer health for container %q", cid)
+ var events []unix.PollFd
+ for _, fd := range goferFDs {
+ events = append(events, unix.PollFd{
+ Fd: int32(fd),
+ Events: unix.POLLHUP | unix.POLLRDHUP,
+ })
+ }
+ _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
+ // Use ppoll instead of poll because it's already whitelisted in seccomp.
+ n, err := unix.Ppoll(events, nil, nil)
+ return uintptr(n), 0, err
+ })
+ if err != nil {
+ panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
+ }
+
+ // Check if the gofer has stopped as part of normal container destruction.
+ // This is done just to avoid sending an annoying error message to the log.
+ // Note that there is a small race window in between mu.Unlock() and the
+ // lock being reacquired in destroyContainer(), but it's harmless to call
+ // destroyContainer() multiple times.
+ l.mu.Lock()
+ _, ok := l.processes[execID{cid: cid}]
+ l.mu.Unlock()
+ if ok {
+ log.Infof("Gofer socket disconnected, destroying container %q", cid)
+ if err := l.destroyContainer(cid); err != nil {
+ log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+ }
+ }
+ }()
+}
+
// destroyContainer stops a container if it is still running and cleans up its
// filesystem.
func (l *Loader) destroyContainer(cid string) error {
l.mu.Lock()
defer l.mu.Unlock()
- // Has the container started?
- _, _, err := l.threadGroupFromIDLocked(execID{cid: cid})
+ _, _, started, err := l.threadGroupFromIDLocked(execID{cid: cid})
+ if err != nil {
+ // Container doesn't exist.
+ return err
+ }
- // If the container has started, kill and wait for all processes.
- if err == nil {
+ // The container exists, has it been started?
+ if started {
if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
}
@@ -754,27 +790,22 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
l.mu.Lock()
defer l.mu.Unlock()
- tg, _, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
+ tg, _, started, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
if err != nil {
- return 0, fmt.Errorf("no such container: %q", args.ContainerID)
+ return 0, err
+ }
+ if !started {
+ return 0, fmt.Errorf("container %q not started", args.ContainerID)
}
- // Get the container Root Dirent and MountNamespace from the Task.
+ // Get the container MountNamespace from the Task.
tg.Leader().WithMuLocked(func(t *kernel.Task) {
- // FSContext.RootDirectory() will take an extra ref for us.
- args.Root = t.FSContext().RootDirectory()
-
// task.MountNamespace() does not take a ref, so we must do so
// ourselves.
args.MountNamespace = t.MountNamespace()
args.MountNamespace.IncRef()
})
- defer func() {
- if args.Root != nil {
- args.Root.DecRef()
- }
- args.MountNamespace.DecRef()
- }()
+ defer args.MountNamespace.DecRef()
// Start the process.
proc := control.Proc{Kernel: l.k}
@@ -895,6 +926,8 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
}
+ s.FillDefaultIPTables()
+
return &s, nil
default:
@@ -1026,20 +1059,28 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
l.mu.Lock()
defer l.mu.Unlock()
- return l.threadGroupFromIDLocked(key)
+ tg, tty, ok, err := l.threadGroupFromIDLocked(key)
+ if err != nil {
+ return nil, nil, err
+ }
+ if !ok {
+ return nil, nil, fmt.Errorf("container %q not started", key.cid)
+ }
+ return tg, tty, nil
}
// threadGroupFromIDLocked returns the thread group and TTY for the given
// execution ID. TTY may be nil if the process is not attached to a terminal.
-// Returns error if execution ID is invalid or if container/process has not
-// started yet. Caller must hold 'mu'.
-func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+// Also returns a boolean indicating whether the container has already started.
+// Returns error if execution ID is invalid or if the container cannot be
+// found (maybe it has been deleted). Caller must hold 'mu'.
+func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, bool, error) {
ep := l.processes[key]
if ep == nil {
- return nil, nil, fmt.Errorf("container not found")
+ return nil, nil, false, fmt.Errorf("container %q not found", key.cid)
}
if ep.tg == nil {
- return nil, nil, fmt.Errorf("container not started")
+ return nil, nil, false, nil
}
- return ep.tg, ep.tty, nil
+ return ep.tg, ep.tty, true, nil
}
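
The gofer monitor works because closing one end of a connected socket raises POLLHUP/POLLRDHUP on the peer. A self-contained sketch of that detection on a socketpair, which stands in for the sandbox/gofer connection (illustration only, not the loader's actual code):

    package main

    import (
        "fmt"

        "golang.org/x/sys/unix"
    )

    func main() {
        // A connected pair of sockets, standing in for a gofer FD.
        fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM, 0)
        if err != nil {
            panic(err)
        }

        // Simulate the gofer dying: close the remote end.
        unix.Close(fds[1])

        // Ppoll on the local end returns once the peer disconnects.
        events := []unix.PollFd{{
            Fd:     int32(fds[0]),
            Events: unix.POLLHUP | unix.POLLRDHUP,
        }}
        if _, err := unix.Ppoll(events, nil, nil); err != nil {
            panic(err)
        }
        fmt.Printf("revents: %#x (POLLHUP=%#x POLLRDHUP=%#x)\n",
            events[0].Revents, unix.POLLHUP, unix.POLLRDHUP)
    }
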
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index ff713660d..147ff7703 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -401,17 +401,16 @@ func TestCreateMountNamespace(t *testing.T) {
}
defer cleanup()
- // setupRootContainer needs to find root from the context after the
- // namespace is created.
- var mns *fs.MountNamespace
- setMountNS := func(m *fs.MountNamespace) {
- mns = m
- ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
+ mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+ mns, err := mntr.createMountNamespace(ctx, conf)
+ if err != nil {
+ t.Fatalf("failed to create mount namespace: %v", err)
}
- mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil, &podMountHints{})
- if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
- t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
+ ctx = fs.WithRoot(ctx, mns.Root())
+ if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
+ t.Fatalf("failed to create mount namespace: %v", err)
}
+
root := mns.Root()
defer root.DecRef()
for _, p := range tc.expectedPaths {
@@ -614,7 +613,7 @@ func TestRestoreEnvironment(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
- mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil, &podMountHints{})
+ mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
actualRenv, err := mntr.createRestoreEnvironment(conf)
if !tc.errorExpected && err != nil {
t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index d3d98243d..ea0d9f790 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -38,8 +38,7 @@ type Network struct {
// Route represents a route in the network stack.
type Route struct {
- Destination net.IP
- Mask net.IPMask
+ Destination net.IPNet
Gateway net.IP
}
@@ -85,16 +84,19 @@ type CreateLinksAndRoutesArgs struct {
// Empty returns true if route hasn't been set.
func (r *Route) Empty() bool {
- return r.Destination == nil && r.Mask == nil && r.Gateway == nil
+ return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil
}
-func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
+func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) {
+ subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask))
+ if err != nil {
+ return tcpip.Route{}, err
+ }
return tcpip.Route{
- Destination: ipToAddress(r.Destination),
+ Destination: subnet,
Gateway: ipToAddress(r.Gateway),
- Mask: ipToAddressMask(net.IP(r.Mask)),
NIC: id,
- }
+ }, nil
}
// CreateLinksAndRoutes creates links and routes in a network stack. It should
@@ -128,7 +130,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
// Collect the routes from this link.
for _, r := range link.Routes {
- routes = append(routes, r.toTcpipRoute(nicID))
+ route, err := r.toTcpipRoute(nicID)
+ if err != nil {
+ return err
+ }
+ routes = append(routes, route)
}
}
@@ -170,7 +176,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
// Collect the routes from this link.
for _, r := range link.Routes {
- routes = append(routes, r.toTcpipRoute(nicID))
+ route, err := r.toTcpipRoute(nicID)
+ if err != nil {
+ return err
+ }
+ routes = append(routes, route)
}
}
@@ -179,7 +189,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
if !ok {
return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
}
- routes = append(routes, args.DefaultGateway.Route.toTcpipRoute(nicID))
+ route, err := args.DefaultGateway.Route.toTcpipRoute(nicID)
+ if err != nil {
+ return err
+ }
+ routes = append(routes, route)
}
log.Infof("Setting routes %+v", routes)
@@ -230,8 +244,8 @@ func ipToAddress(ip net.IP) tcpip.Address {
return addr
}
-// ipToAddressMask converts IP to tcpip.AddressMask, ignoring the protocol.
-func ipToAddressMask(ip net.IP) tcpip.AddressMask {
- _, addr := ipToAddressAndProto(ip)
- return tcpip.AddressMask(addr)
+// ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the
+// protocol.
+func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask {
+ return tcpip.AddressMask(ipToAddress(net.IP(ipMask)))
}
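
With Destination now a net.IPNet, callers can build a Route straight from CIDR notation and let toTcpipRoute derive the tcpip.Subnet. A small sketch with hypothetical addresses (assumes the gvisor module on the import path; Route and Empty as defined in this diff):

    package main

    import (
        "fmt"
        "net"

        "gvisor.dev/gvisor/runsc/boot"
    )

    func main() {
        // ParseCIDR returns the network with host bits already masked off.
        _, dst, err := net.ParseCIDR("192.168.10.0/24")
        if err != nil {
            panic(err)
        }
        r := boot.Route{
            Destination: *dst,
            Gateway:     net.ParseIP("192.168.10.1"),
        }
        fmt.Printf("%+v empty=%v\n", r, r.Empty())
    }
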
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
index 834003430..906baf3e5 100644
--- a/runsc/boot/user_test.go
+++ b/runsc/boot/user_test.go
@@ -164,13 +164,13 @@ func TestGetExecUserHome(t *testing.T) {
},
}
- var mns *fs.MountNamespace
- setMountNS := func(m *fs.MountNamespace) {
- mns = m
- ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
+ mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{})
+ mns, err := mntr.createMountNamespace(ctx, conf)
+ if err != nil {
+ t.Fatalf("failed to create mount namespace: %v", err)
}
- mntr := newContainerMounter(spec, "", []int{sandEnd}, nil, &podMountHints{})
- if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
+ ctx = fs.WithRoot(ctx, mns.Root())
+ if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
t.Fatalf("failed to create mount namespace: %v", err)
}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index e246c38ae..de8202bb1 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -49,6 +49,7 @@ go_test(
"//pkg/abi/linux",
"//pkg/log",
"//pkg/sentry/control",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/unet",
"//pkg/urpc",
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 8320bb2ca..bbb364214 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -138,6 +138,34 @@ type Container struct {
RootContainerDir string
}
+// loadSandbox loads all containers that belong to the sandbox with the given
+// ID.
+func loadSandbox(rootDir, id string) ([]*Container, error) {
+ cids, err := List(rootDir)
+ if err != nil {
+ return nil, err
+ }
+
+ // Load the container metadata.
+ var containers []*Container
+ for _, cid := range cids {
+ container, err := Load(rootDir, cid)
+ if err != nil {
+ // Container file may not exist if it raced with creation/deletion or
+ // directory was left behind. Load provides a snapshot in time, so it's
+ // fine to skip it.
+ if os.IsNotExist(err) {
+ continue
+ }
+ return nil, fmt.Errorf("loading container %q: %v", id, err)
+ }
+ if container.Sandbox.ID == id {
+ containers = append(containers, container)
+ }
+ }
+ return containers, nil
+}
+
// Load loads a container with the given id from a metadata file. id may be an
// abbreviation of the full container id, in which case Load loads the
// container to which id unambiguously refers to.
@@ -180,7 +208,7 @@ func Load(rootDir, id string) (*Container, error) {
// If the status is "Running" or "Created", check that the sandbox
// process still exists, and set it to Stopped if it does not.
//
- // This is inherently racey.
+ // This is inherently racy.
if c.Status == Running || c.Status == Created {
// Check if the sandbox process is still running.
if !c.isSandboxRunning() {
@@ -237,7 +265,13 @@ func List(rootDir string) ([]string, error) {
}
var out []string
for _, f := range fs {
- out = append(out, f.Name())
+ // Filter out directories that do not belong to a container.
+ cid := f.Name()
+ if validateID(cid) == nil {
+ if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil {
+ out = append(out, f.Name())
+ }
+ }
}
return out, nil
}
@@ -475,7 +509,13 @@ func (c *Container) Start(conf *boot.Config) error {
}
c.changeStatus(Running)
- return c.save()
+ if err := c.save(); err != nil {
+ return err
+ }
+
+ // Adjust the oom_score_adj for sandbox and gofers. This must be done after
+ // save().
+ return c.adjustOOMScoreAdj(conf)
}
// Restore takes a container and replaces its kernel and file system
@@ -1098,3 +1138,68 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
}
return fn()
}
+
+// adjustOOMScoreAdj sets the oom_score_adj for the sandbox and all gofers.
+// oom_score_adj is set to the lowest oom_score_adj among the containers
+// running in the sandbox.
+//
+// TODO(gvisor.dev/issue/512): This call could race with other containers being
+// created at the same time and end up setting the wrong oom_score_adj to the
+// sandbox.
+func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
+ // If this container's OOMScoreAdj is nil then we can exit early as no
+ // change should be made to oom_score_adj for the sandbox.
+ if c.Spec.Process.OOMScoreAdj == nil {
+ return nil
+ }
+
+ containers, err := loadSandbox(conf.RootDir, c.Sandbox.ID)
+ if err != nil {
+ return fmt.Errorf("loading sandbox containers: %v", err)
+ }
+
+ // Get the lowest score for all containers.
+ var lowScore int
+ scoreFound := false
+ for _, container := range containers {
+ if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
+ scoreFound = true
+ lowScore = *container.Spec.Process.OOMScoreAdj
+ }
+ }
+
+ // Only set oom_score_adj if one of the containers has oom_score_adj set
+ // in the OCI bundle. If not, we need to inherit the parent process's
+ // oom_score_adj.
+ // See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process
+ if !scoreFound {
+ return nil
+ }
+
+ // Set the sandbox's oom_score_adj to the lowest among all containers.
+ if err := setOOMScoreAdj(c.Sandbox.Pid, lowScore); err != nil {
+ return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", c.Sandbox.ID, err)
+ }
+
+ // Set container's oom_score_adj to the gofer since it is dedicated to the
+ // container, in case the gofer uses up too much memory.
+ if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
+ return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
+ }
+ return nil
+}
+
+// setOOMScoreAdj sets oom_score_adj to the given value for the given PID.
+// /proc must be available and mounted read-write. scoreAdj should be between
+// -1000 and 1000.
+func setOOMScoreAdj(pid int, scoreAdj int) error {
+ f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+ if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
+ return err
+ }
+ return nil
+}
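
setOOMScoreAdj uses the plain procfs text interface, so the value is easy to inspect. A stdlib-only sketch that reads the current process's oom_score_adj the same way the helper writes it:

    package main

    import (
        "fmt"
        "io/ioutil"
        "os"
        "strconv"
        "strings"
    )

    func main() {
        path := fmt.Sprintf("/proc/%d/oom_score_adj", os.Getpid())
        data, err := ioutil.ReadFile(path)
        if err != nil {
            panic(err)
        }
        // The file holds a single integer in [-1000, 1000].
        val, err := strconv.Atoi(strings.TrimSpace(string(data)))
        if err != nil {
            panic(err)
        }
        fmt.Printf("%s = %d\n", path, val)
    }
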
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index ff68c586e..3d4f304f3 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -76,7 +76,7 @@ func waitForProcessCount(cont *Container, want int) error {
}
func blockUntilWaitable(pid int) error {
- _, _, err := testutil.RetryEintr(func() (uintptr, uintptr, error) {
+ _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
var err error
_, _, err1 := syscall.Syscall6(syscall.SYS_WAITID, 1, uintptr(pid), 0, syscall.WEXITED|syscall.WNOWAIT, 0, 0)
if err1 != 0 {
@@ -1310,10 +1310,13 @@ func TestRunNonRoot(t *testing.T) {
t.Logf("Running test with conf: %+v", conf)
spec := testutil.NewSpecWithArgs("/bin/true")
+
+ // Set a random user/group with no access to "blocked" dir.
spec.Process.User.UID = 343
spec.Process.User.GID = 2401
+ spec.Process.Capabilities = nil
- // User that container runs as can't list '$TMP/blocked' and would fail to
+ // User running inside container can't list '$TMP/blocked' and would fail to
// mount it.
dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
if err != nil {
@@ -1327,6 +1330,17 @@ func TestRunNonRoot(t *testing.T) {
t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
}
+ src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+ if err != nil {
+ t.Fatalf("ioutil.TempDir() failed: %v", err)
+ }
+
+ spec.Mounts = append(spec.Mounts, specs.Mount{
+ Destination: dir,
+ Source: src,
+ Type: "bind",
+ })
+
if err := run(spec, conf); err != nil {
t.Fatalf("error running sandbox: %v", err)
}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 978a422f5..ae03d24b4 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -29,6 +29,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/sentry/control"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/specutils"
"gvisor.dev/gvisor/runsc/test/testutil"
@@ -59,11 +60,14 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
}
func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
- rootDir, err := testutil.SetupRootDir()
- if err != nil {
- return nil, nil, fmt.Errorf("error creating root dir: %v", err)
+ // Setup root dir if one hasn't been provided.
+ if len(conf.RootDir) == 0 {
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ return nil, nil, fmt.Errorf("error creating root dir: %v", err)
+ }
+ conf.RootDir = rootDir
}
- conf.RootDir = rootDir
var containers []*Container
var bundles []string
@@ -74,7 +78,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
for _, b := range bundles {
os.RemoveAll(b)
}
- os.RemoveAll(rootDir)
+ os.RemoveAll(conf.RootDir)
}
for i, spec := range specs {
bundleDir, err := testutil.SetupBundleDir(spec)
@@ -488,7 +492,7 @@ func TestMultiContainerSignal(t *testing.T) {
if err := containers[1].Destroy(); err != nil {
t.Errorf("failed to destroy container: %v", err)
}
- _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) {
+ _, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) {
cpid, err := syscall.Wait4(goferPid, nil, 0, nil)
return uintptr(cpid), 0, err
})
@@ -905,9 +909,9 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
}
}
-// TestMultiContainerGoferStop tests that IO operations continue to work after
-// containers have been stopped and gofers killed.
-func TestMultiContainerGoferStop(t *testing.T) {
+// TestMultiContainerContainerDestroyStress tests that IO operations continue
+// to work after containers have been stopped and gofers killed.
+func TestMultiContainerContainerDestroyStress(t *testing.T) {
app, err := testutil.FindFile("runsc/container/test_app/test_app")
if err != nil {
t.Fatal("error finding test_app:", err)
@@ -1345,3 +1349,194 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
}
}
}
+
+// Test that container is destroyed when Gofer is killed.
+func TestMultiContainerGoferKilled(t *testing.T) {
+ sleep := []string{"sleep", "100"}
+ specs, ids := createSpecs(sleep, sleep, sleep)
+ conf := testutil.TestConfig()
+ containers, cleanup, err := startContainers(conf, specs, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ // Ensure container is running
+ c := containers[2]
+ expectedPL := []*control.Process{
+ {PID: 3, Cmd: "sleep"},
+ }
+ if err := waitForProcessList(c, expectedPL); err != nil {
+ t.Errorf("failed to wait for sleep to start: %v", err)
+ }
+
+ // Kill container's gofer.
+ if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+ t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err)
+ }
+
+ // Wait until container stops.
+ if err := waitForProcessList(c, nil); err != nil {
+ t.Errorf("Container %q was not stopped after gofer death: %v", c.ID, err)
+ }
+
+ // Check that container isn't running anymore.
+ args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+ if _, err := c.executeSync(args); err == nil {
+ t.Fatalf("Container %q was not stopped after gofer death", c.ID)
+ }
+
+ // Check that other containers are unaffected.
+ for i, c := range containers {
+ if i == 2 {
+ continue // container[2] has been killed.
+ }
+ pl := []*control.Process{
+ {PID: kernel.ThreadID(i + 1), Cmd: "sleep"},
+ }
+ if err := waitForProcessList(c, pl); err != nil {
+ t.Errorf("Container %q was affected by another container: %v", c.ID, err)
+ }
+ args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+ if _, err := c.executeSync(args); err != nil {
+ t.Fatalf("Container %q was affected by another container: %v", c.ID, err)
+ }
+ }
+
+ // Kill root container's gofer to bring entire sandbox down.
+ c = containers[0]
+ if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+ t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err)
+ }
+
+ // Wait until sandbox stops. waitForProcessList will loop until sandbox exits
+ // and RPC errors out.
+ impossiblePL := []*control.Process{
+ {PID: 100, Cmd: "non-existent-process"},
+ }
+ if err := waitForProcessList(c, impossiblePL); err == nil {
+ t.Fatalf("Sandbox was not killed after gofer death")
+ }
+
+ // Check that entire sandbox isn't running anymore.
+ for _, c := range containers {
+ args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+ if _, err := c.executeSync(args); err == nil {
+ t.Fatalf("Container %q was not stopped after gofer death", c.ID)
+ }
+ }
+}
+
+func TestMultiContainerLoadSandbox(t *testing.T) {
+ sleep := []string{"sleep", "100"}
+ specs, ids := createSpecs(sleep, sleep, sleep)
+ conf := testutil.TestConfig()
+
+ // Create containers for the sandbox.
+ wants, cleanup, err := startContainers(conf, specs, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ // Then create unrelated containers.
+ for i := 0; i < 3; i++ {
+ specs, ids = createSpecs(sleep, sleep, sleep)
+ _, cleanup, err = startContainers(conf, specs, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+ }
+
+ // Create an unrelated directory under root.
+ dir := filepath.Join(conf.RootDir, "not-a-container")
+ if err := os.MkdirAll(dir, 0755); err != nil {
+ t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+ }
+
+ // Create a valid but empty container directory.
+ randomCID := testutil.UniqueContainerID()
+ dir = filepath.Join(conf.RootDir, randomCID)
+ if err := os.MkdirAll(dir, 0755); err != nil {
+ t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+ }
+
+ // Load the sandbox and check that the correct containers were returned.
+ id := wants[0].Sandbox.ID
+ gots, err := loadSandbox(conf.RootDir, id)
+ if err != nil {
+ t.Fatalf("loadSandbox()=%v", err)
+ }
+ wantIDs := make(map[string]struct{})
+ for _, want := range wants {
+ wantIDs[want.ID] = struct{}{}
+ }
+ for _, got := range gots {
+ if got.Sandbox.ID != id {
+ t.Errorf("wrong sandbox ID, got: %v, want: %v", got.Sandbox.ID, id)
+ }
+ if _, ok := wantIDs[got.ID]; !ok {
+ t.Errorf("wrong container ID, got: %v, wants: %v", got.ID, wantIDs)
+ }
+ delete(wantIDs, got.ID)
+ }
+ if len(wantIDs) != 0 {
+ t.Errorf("containers not found: %v", wantIDs)
+ }
+}
+
+// TestMultiContainerRunNonRoot checks that a child container can be configured
+// when running as a non-privileged user.
+func TestMultiContainerRunNonRoot(t *testing.T) {
+ cmdRoot := []string{"/bin/sleep", "100"}
+ cmdSub := []string{"/bin/true"}
+ podSpecs, ids := createSpecs(cmdRoot, cmdSub)
+
+ // User running inside container can't list '$TMP/blocked' and would fail to
+ // mount it.
+ blocked, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
+ if err != nil {
+ t.Fatalf("ioutil.TempDir() failed: %v", err)
+ }
+ if err := os.Chmod(blocked, 0700); err != nil {
+ t.Fatalf("os.MkDir(%q) failed: %v", blocked, err)
+ }
+ dir := path.Join(blocked, "test")
+ if err := os.Mkdir(dir, 0755); err != nil {
+ t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
+ }
+
+ src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+ if err != nil {
+ t.Fatalf("ioutil.TempDir() failed: %v", err)
+ }
+
+ // Set a random user/group with no access to "blocked" dir.
+ podSpecs[1].Process.User.UID = 343
+ podSpecs[1].Process.User.GID = 2401
+ podSpecs[1].Process.Capabilities = nil
+
+ podSpecs[1].Mounts = append(podSpecs[1].Mounts, specs.Mount{
+ Destination: dir,
+ Source: src,
+ Type: "bind",
+ })
+
+ conf := testutil.TestConfig()
+ pod, cleanup, err := startContainers(conf, podSpecs, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ // Once all containers are started, wait for the child container to exit.
+ // This means that the volume was mounted properly.
+ ws, err := pod[1].Wait()
+ if err != nil {
+ t.Fatalf("running child container: %v", err)
+ }
+ if !ws.Exited() || ws.ExitStatus() != 0 {
+ t.Fatalf("child container failed, waitStatus: %v", ws)
+ }
+}
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 2d50774d4..8ddfa77d6 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -138,7 +138,7 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_NANOSLEEP: {},
syscall.SYS_NEWFSTATAT: {},
syscall.SYS_OPENAT: {},
- syscall.SYS_POLL: {},
+ syscall.SYS_PPOLL: {},
syscall.SYS_PREAD64: {},
syscall.SYS_PWRITE64: {},
syscall.SYS_READ: {},
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index fe450c64f..7c4d2b94e 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -125,7 +125,7 @@ func (a *attachPoint) Attach() (p9.File, error) {
return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err)
}
mode := syscall.O_RDWR
- if a.conf.ROMount || stat.Mode&syscall.S_IFDIR != 0 {
+ if a.conf.ROMount || (stat.Mode&syscall.S_IFMT) == syscall.S_IFDIR {
mode = syscall.O_RDONLY
}
@@ -141,9 +141,13 @@ func (a *attachPoint) Attach() (p9.File, error) {
f.Close()
return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
}
- a.attached = true
- return newLocalFile(a, f, a.prefix, stat)
+ rv, err := newLocalFile(a, f, a.prefix, stat)
+ if err != nil {
+ return nil, err
+ }
+ a.attached = true
+ return rv, nil
}
// makeQID returns a unique QID for the given stat buffer.
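
The Attach fix matters because the S_IF* type constants share bits: S_IFSOCK (0xc000) contains every bit of S_IFDIR (0x4000), so the old test stat.Mode&syscall.S_IFDIR != 0 also matched sockets. Masking with S_IFMT before comparing is the correct type check, as this stdlib-only sketch shows:

    package main

    import (
        "fmt"
        "syscall"
    )

    func main() {
        mode := uint32(syscall.S_IFSOCK) // type bits of a socket

        // Old test: also true for sockets, since S_IFSOCK&S_IFDIR != 0.
        fmt.Println("old:", mode&syscall.S_IFDIR != 0) // true
        // Fixed test: true only for real directories.
        fmt.Println("new:", mode&syscall.S_IFMT == syscall.S_IFDIR) // false
    }
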
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index 0a162bb8a..cbbe71019 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -17,8 +17,10 @@ package fsgofer
import (
"fmt"
"io/ioutil"
+ "net"
"os"
"path"
+ "path/filepath"
"syscall"
"testing"
@@ -621,6 +623,54 @@ func TestAttachFile(t *testing.T) {
}
}
+func TestAttachInvalidType(t *testing.T) {
+ dir, err := ioutil.TempDir("", "attach-")
+ if err != nil {
+ t.Fatalf("ioutil.TempDir() failed, err: %v", err)
+ }
+ defer os.RemoveAll(dir)
+
+ fifo := filepath.Join(dir, "fifo")
+ if err := syscall.Mkfifo(fifo, 0755); err != nil {
+ t.Fatalf("Mkfifo(%q): %v", fifo, err)
+ }
+
+ dirFile, err := os.Open(dir)
+ if err != nil {
+ t.Fatalf("Open(%s): %v", dir, err)
+ }
+ defer dirFile.Close()
+
+ // Bind a socket via /proc to be sure that the socket path length is
+ // less than UNIX_PATH_MAX.
+ socket := filepath.Join(fmt.Sprintf("/proc/self/fd/%d", dirFile.Fd()), "socket")
+ l, err := net.Listen("unix", socket)
+ if err != nil {
+ t.Fatalf("net.Listen(unix, %q): %v", socket, err)
+ }
+ defer l.Close()
+
+ for _, tc := range []struct {
+ name string
+ path string
+ }{
+ {name: "fifo", path: fifo},
+ {name: "socket", path: socket},
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ conf := Config{ROMount: false}
+ a, err := NewAttachPoint(tc.path, conf)
+ if err != nil {
+ t.Fatalf("NewAttachPoint failed: %v", err)
+ }
+ f, err := a.Attach()
+ if f != nil || err == nil {
+ t.Fatalf("Attach should have failed, got (%v, nil)", f)
+ }
+ })
+ }
+}
+
func TestDoubleAttachError(t *testing.T) {
conf := Config{ROMount: false}
root, err := ioutil.TempDir("", "root-")
diff --git a/runsc/main.go b/runsc/main.go
index 58e7dd8f3..70f06dbb8 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -22,6 +22,7 @@ import (
"io"
"io/ioutil"
"os"
+ "os/signal"
"path/filepath"
"strings"
"syscall"
@@ -30,6 +31,7 @@ import (
"github.com/google/subcommands"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/cmd"
@@ -174,6 +176,10 @@ func main() {
cmd.Fatalf("%v", err)
}
+ // Set the reference leak check mode. Also set it in the config below to
+ // propagate it to child processes.
+ refs.SetLeakMode(refsLeakMode)
+
// Create a new Config from the flags.
conf := &boot.Config{
RootDir: *rootDir,
@@ -264,6 +270,13 @@ func main() {
log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
log.Infof("***************************")
+ if *testOnlyAllowRunAsCurrentUserWithoutChroot {
+ // SIGTERM is sent to all processes if a test exceeds its
+ // timeout and this case is handled by syscall_test_runner.
+ log.Warningf("Block the TERM signal. This is only safe in tests!")
+ signal.Ignore(syscall.SIGTERM)
+ }
+
// Call the subcommand and pass in the configuration.
var ws syscall.WaitStatus
subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index a965a9dcb..5634f0707 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -33,13 +33,6 @@ import (
"gvisor.dev/gvisor/runsc/specutils"
)
-const (
- // Annotations used to indicate whether the container corresponds to a
- // pod or a container within a pod.
- crioContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType"
- containerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
-)
-
// setupNetwork configures the network stack to mimic the local network
// configuration. Docker uses network namespaces with vnets to configure the
// network for the container. The untrusted app expects to see the same network
@@ -88,12 +81,17 @@ func createDefaultLoopbackInterface(conn *urpc.Client) error {
},
Routes: []boot.Route{
{
- Destination: net.IP("\x7f\x00\x00\x00"),
- Mask: net.IPMask("\xff\x00\x00\x00"),
+ Destination: net.IPNet{
+ IP: net.IPv4(0x7f, 0, 0, 0),
+ Mask: net.IPv4Mask(0xff, 0, 0, 0),
+ },
},
{
- Destination: net.IPv6loopback,
- Mask: net.IPMask(strings.Repeat("\xff", 16)),
+ Destination: net.IPNet{
+ IP: net.IPv6loopback,
+ Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+ },
},
},
}
@@ -333,12 +331,13 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink,
if !ok {
return nil, fmt.Errorf("address is not IPNet: %+v", addr)
}
+ dst := *ipNet
+ dst.IP = dst.IP.Mask(dst.Mask)
links = append(links, boot.LoopbackLink{
Name: iface.Name,
Addresses: []net.IP{ipNet.IP},
Routes: []boot.Route{{
- Destination: ipNet.IP.Mask(ipNet.Mask),
- Mask: ipNet.Mask,
+ Destination: dst,
}},
})
}
@@ -374,9 +373,11 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
}
// Create a catch all route to the gateway.
def = &boot.Route{
- Destination: net.IPv4zero,
- Mask: net.IPMask(net.IPv4zero),
- Gateway: r.Gw,
+ Destination: net.IPNet{
+ IP: net.IPv4zero,
+ Mask: net.IPMask(net.IPv4zero),
+ },
+ Gateway: r.Gw,
}
continue
}
@@ -384,9 +385,10 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
log.Warningf("IPv6 is not supported, skipping route: %v", r)
continue
}
+ dst := *r.Dst
+ dst.IP = dst.IP.Mask(dst.Mask)
routes = append(routes, boot.Route{
- Destination: r.Dst.IP.Mask(r.Dst.Mask),
- Mask: r.Dst.Mask,
+ Destination: dst,
Gateway: r.Gw,
})
}
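
The repeated dst.IP = dst.IP.Mask(dst.Mask) lines canonicalize the destination: an IPNet taken from an interface address such as 10.0.5.7/24 must describe the network 10.0.5.0/24, not the host, before it is used as a route. A stdlib-only sketch of the normalization:

    package main

    import (
        "fmt"
        "net"
    )

    func main() {
        dst := net.IPNet{
            IP:   net.ParseIP("10.0.5.7"), // host address from the iface
            Mask: net.CIDRMask(24, 32),
        }
        // Zero the host bits so the IPNet names the network itself.
        dst.IP = dst.IP.Mask(dst.Mask)
        fmt.Println(dst.String()) // 10.0.5.0/24
    }
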
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 4a11f617d..df3c0c5ef 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -54,7 +54,7 @@ type Sandbox struct {
// ID as the first container run in the sandbox.
ID string `json:"id"`
- // Pid is the pid of the running sandbox (immutable). May be 0 is the sandbox
+ // Pid is the pid of the running sandbox (immutable). May be 0 if the sandbox
// is not running.
Pid int `json:"pid"`
@@ -361,6 +361,8 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
nextFD++
}
+ cmd.Args = append(cmd.Args, "--panic-signal="+strconv.Itoa(int(syscall.SIGTERM)))
+
// Add the "boot" command to the args.
//
// All flags after this must be for the boot command
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 0b40e38a3..2eec92349 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -492,3 +492,14 @@ func (c *Cleanup) Clean() {
func (c *Cleanup) Release() {
c.clean = nil
}
+
+// RetryEintr retries the function until an error different than EINTR is
+// returned.
+func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
+ for {
+ r1, r2, err := f()
+ if err != syscall.EINTR {
+ return r1, r2, err
+ }
+ }
+}
diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go
index 3f3e191b0..94e625259 100644
--- a/runsc/test/testutil/docker.go
+++ b/runsc/test/testutil/docker.go
@@ -297,7 +297,11 @@ func (d *Docker) Remove() error {
func (d *Docker) CleanUp() {
d.logDockerID()
if _, err := do("kill", d.Name); err != nil {
- log.Printf("error killing container %q: %v", d.Name, err)
+ if strings.Contains(err.Error(), "is not running") {
+ // Nothing to kill. Don't log the error in this case.
+ } else {
+ log.Printf("error killing container %q: %v", d.Name, err)
+ }
}
if err := d.Remove(); err != nil {
log.Print(err)
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index a98675bfc..4a3dfa0e3 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -127,13 +127,15 @@ func FindFile(path string) (string, error) {
// 'RootDir' must be set by caller if required.
func TestConfig() *boot.Config {
return &boot.Config{
- Debug: true,
- LogFormat: "text",
- LogPackets: true,
- Network: boot.NetworkNone,
- Strace: true,
- Platform: "ptrace",
- FileAccess: boot.FileAccessExclusive,
+ Debug: true,
+ LogFormat: "text",
+ DebugLogFormat: "text",
+ AlsoLogToStderr: true,
+ LogPackets: true,
+ Network: boot.NetworkNone,
+ Strace: true,
+ Platform: "ptrace",
+ FileAccess: boot.FileAccessExclusive,
TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
NumNetworkChannels: 1,
}
@@ -189,11 +191,14 @@ func SetupRootDir() (string, error) {
// SetupContainer creates a bundle and root dir for the container, generates a
// test config, and writes the spec to config.json in the bundle dir.
func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) {
- rootDir, err = SetupRootDir()
- if err != nil {
- return "", "", err
+ // Setup root dir if one hasn't been provided.
+ if len(conf.RootDir) == 0 {
+ rootDir, err = SetupRootDir()
+ if err != nil {
+ return "", "", err
+ }
+ conf.RootDir = rootDir
}
- conf.RootDir = rootDir
bundleDir, err = SetupBundleDir(spec)
return rootDir, bundleDir, err
}
@@ -348,17 +353,6 @@ func StartReaper() func() {
return r.Stop
}
-// RetryEintr retries the function until an error different than EINTR is
-// returned.
-func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
- for {
- r1, r2, err := f()
- if err != syscall.EINTR {
- return r1, r2, err
- }
- }
-}
-
// WaitUntilRead reads from the given reader until the wanted string is found
// or until timeout.
func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error {