diff options
Diffstat (limited to 'runsc/boot')
-rw-r--r-- | runsc/boot/BUILD | 4 | ||||
-rw-r--r-- | runsc/boot/config.go | 63 | ||||
-rw-r--r-- | runsc/boot/controller.go | 14 | ||||
-rw-r--r-- | runsc/boot/filter/config.go | 14 | ||||
-rw-r--r-- | runsc/boot/fs.go | 256 | ||||
-rw-r--r-- | runsc/boot/loader.go | 127 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 17 | ||||
-rw-r--r-- | runsc/boot/network.go | 18 | ||||
-rw-r--r-- | runsc/boot/user.go | 28 | ||||
-rw-r--r-- | runsc/boot/user_test.go | 15 |
10 files changed, 337 insertions, 219 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 588bb8851..6fe2b57de 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -57,10 +57,10 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/sighandling", - "//pkg/sentry/socket/epsocket", "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/netlink", "//pkg/sentry/socket/netlink/route", + "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix", "//pkg/sentry/state", "//pkg/sentry/strace", @@ -80,6 +80,7 @@ go_library( "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", "//pkg/tcpip/transport/icmp", + "//pkg/tcpip/transport/raw", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/urpc", @@ -109,6 +110,7 @@ go_test( "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", "//pkg/unet", "//runsc/fsgofer", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 7ae0dd05d..38278d0a2 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -19,6 +19,7 @@ import ( "strconv" "strings" + "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/watchdog" ) @@ -112,6 +113,34 @@ func MakeWatchdogAction(s string) (watchdog.Action, error) { } } +// MakeRefsLeakMode converts type from string. +func MakeRefsLeakMode(s string) (refs.LeakMode, error) { + switch strings.ToLower(s) { + case "disabled": + return refs.NoLeakChecking, nil + case "log-names": + return refs.LeaksLogWarning, nil + case "log-traces": + return refs.LeaksLogTraces, nil + default: + return 0, fmt.Errorf("invalid refs leakmode %q", s) + } +} + +func refsLeakModeToString(mode refs.LeakMode) string { + switch mode { + // If not set, default it to disabled. + case refs.UninitializedLeakChecking, refs.NoLeakChecking: + return "disabled" + case refs.LeaksLogWarning: + return "log-names" + case refs.LeaksLogTraces: + return "log-traces" + default: + panic(fmt.Sprintf("Invalid leakmode: %d", mode)) + } +} + // Config holds configuration that is not part of the runtime spec. type Config struct { // RootDir is the runtime root directory. @@ -138,6 +167,9 @@ type Config struct { // Overlay is whether to wrap the root filesystem in an overlay. Overlay bool + // FSGoferHostUDS enables the gofer to mount a host UDS. + FSGoferHostUDS bool + // Network indicates what type of network to use. Network NetworkType @@ -182,12 +214,6 @@ type Config struct { // RestoreFile is the path to the saved container image RestoreFile string - // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in - // tests. It allows runsc to start the sandbox process as the current - // user, and without chrooting the sandbox process. This can be - // necessary in test environments that have limited capabilities. - TestOnlyAllowRunAsCurrentUserWithoutChroot bool - // NumNetworkChannels controls the number of AF_PACKET sockets that map // to the same underlying network device. This allows netstack to better // scale for high throughput use cases. @@ -201,6 +227,22 @@ type Config struct { // AlsoLogToStderr allows to send log messages to stderr. AlsoLogToStderr bool + + // ReferenceLeakMode sets reference leak check mode + ReferenceLeakMode refs.LeakMode + + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in + // tests. It allows runsc to start the sandbox process as the current + // user, and without chrooting the sandbox process. This can be + // necessary in test environments that have limited capabilities. + TestOnlyAllowRunAsCurrentUserWithoutChroot bool + + // TestOnlyTestNameEnv should only be used in tests. It looks up for the + // test name in the container environment variables and adds it to the debug + // log file name. This is done to help identify the log with the test when + // multiple tests are run in parallel, since there is no way to pass + // parameters to the runtime from docker. + TestOnlyTestNameEnv string } // ToFlags returns a slice of flags that correspond to the given Config. @@ -214,6 +256,7 @@ func (c *Config) ToFlags() []string { "--debug-log-format=" + c.DebugLogFormat, "--file-access=" + c.FileAccess.String(), "--overlay=" + strconv.FormatBool(c.Overlay), + "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS), "--network=" + c.Network.String(), "--log-packets=" + strconv.FormatBool(c.LogPackets), "--platform=" + c.Platform, @@ -227,10 +270,14 @@ func (c *Config) ToFlags() []string { "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), "--rootless=" + strconv.FormatBool(c.Rootless), "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr), + "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), } + // Only include these if set since it is never to be used by users. if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { - // Only include if set since it is never to be used by users. - f = append(f, "-TESTONLY-unsafe-nonroot=true") + f = append(f, "--TESTONLY-unsafe-nonroot=true") + } + if len(c.TestOnlyTestNameEnv) != 0 { + f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv) } return f } diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 72cbabd16..a73c593ea 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -18,7 +18,6 @@ import ( "errors" "fmt" "os" - "path" "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -27,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/socket/epsocket" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/state" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/watchdog" @@ -142,7 +141,7 @@ func newController(fd int, l *Loader) (*controller, error) { } srv.Register(manager) - if eps, ok := l.k.NetworkStack().(*epsocket.Stack); ok { + if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok { net := &Network{ Stack: eps.Stack, } @@ -234,13 +233,6 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } - // Prevent CIDs containing ".." from confusing the sentry when creating - // /containers/<cid> directory. - // TODO(b/129293409): Once we have multiple independent roots, this - // check won't be necessary. - if path.Clean(args.CID) != args.CID { - return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) - } if len(args.FilePayload.Files) < 4 { return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") } @@ -355,7 +347,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { fs.SetRestoreEnvironment(*renv) // Prepare to load from the state file. - if eps, ok := networkStack.(*epsocket.Stack); ok { + if eps, ok := networkStack.(*netstack.Stack); ok { stack.StackFromEnv = eps.Stack // FIXME(b/36201077) } info, err := specFile.Stat() diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 7ca776b3a..a2ecc6bcb 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -88,14 +88,24 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(0), }, { seccomp.AllowAny{}, seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), seccomp.AllowAny{}, + }, + // Non-private variants are included for flipcall support. They are otherwise + // unncessary, as the sentry will use only private futexes internally. + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE), seccomp.AllowAny{}, - seccomp.AllowValue(0), }, }, syscall.SYS_GETPID: {}, diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index b6eeacf98..393c2a88b 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -25,19 +25,21 @@ import ( // Include filesystem types that OCI spec might mount. _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" - "gvisor.dev/gvisor/pkg/sentry/fs/gofer" _ "gvisor.dev/gvisor/pkg/sentry/fs/host" _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" - "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/gofer" + "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/specutils" ) @@ -62,6 +64,9 @@ const ( nonefs = "none" ) +// tmpfs has some extra supported options that we must pass through. +var tmpfsAllowedOptions = []string{"mode", "uid", "gid"} + func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { // Upper layer uses the same flags as lower, but it must be read-write. upperFlags := lowerFlags @@ -170,27 +175,25 @@ func p9MountOptions(fd int, fa FileAccessType) []string { func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { var out []string for _, o := range opts { - kv := strings.Split(o, "=") - switch len(kv) { - case 1: - if specutils.ContainsStr(allowedKeys, o) { - out = append(out, o) - continue - } - log.Warningf("ignoring unsupported key %q", kv) - case 2: - if specutils.ContainsStr(allowedKeys, kv[0]) { - out = append(out, o) - continue - } - log.Warningf("ignoring unsupported key %q", kv[0]) - default: - return nil, fmt.Errorf("invalid option %q", o) + ok, err := parseMountOption(o, allowedKeys...) + if err != nil { + return nil, err + } + if ok { + out = append(out, o) } } return out, nil } +func parseMountOption(opt string, allowedKeys ...string) (bool, error) { + kv := strings.SplitN(opt, "=", 3) + if len(kv) > 2 { + return false, fmt.Errorf("invalid option %q", opt) + } + return specutils.ContainsStr(allowedKeys, kv[0]), nil +} + // mountDevice returns a device string based on the fs type and target // of the mount. func mountDevice(m specs.Mount) string { @@ -205,6 +208,8 @@ func mountDevice(m specs.Mount) string { func mountFlags(opts []string) fs.MountSourceFlags { mf := fs.MountSourceFlags{} + // Note: changes to supported options must be reflected in + // isSupportedMountFlag() as well. for _, o := range opts { switch o { case "rw": @@ -222,6 +227,18 @@ func mountFlags(opts []string) fs.MountSourceFlags { return mf } +func isSupportedMountFlag(fstype, opt string) bool { + switch opt { + case "rw", "ro", "noatime", "noexec": + return true + } + if fstype == tmpfs { + ok, err := parseMountOption(opt, tmpfsAllowedOptions...) + return ok && err == nil + } + return false +} + func mustFindFilesystem(name string) fs.Filesystem { fs, ok := fs.FindFilesystem(name) if !ok { @@ -261,6 +278,18 @@ func subtargets(root string, mnts []specs.Mount) []string { return targets } +func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + mns, err := mntr.setupFS(conf, procArgs) + if err != nil { + return err + } + + // Set namespace here so that it can be found in ctx. + procArgs.MountNamespace = mns + + return setExecutablePath(ctx, procArgs) +} + // setExecutablePath sets the procArgs.Filename by searching the PATH for an // executable matching the procArgs.Argv[0]. func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error { @@ -413,6 +442,39 @@ func (m *mountHint) isSupported() bool { return m.mount.Type == tmpfs && m.share == pod } +// checkCompatible verifies that shared mount is compatible with master. +// For now enforce that all options are the same. Once bind mount is properly +// supported, then we should ensure the master is less restrictive than the +// container, e.g. master can be 'rw' while container mounts as 'ro'. +func (m *mountHint) checkCompatible(mount specs.Mount) error { + // Remove options that don't affect to mount's behavior. + masterOpts := filterUnsupportedOptions(m.mount) + slaveOpts := filterUnsupportedOptions(mount) + + if len(masterOpts) != len(slaveOpts) { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + } + + sort.Strings(masterOpts) + sort.Strings(slaveOpts) + for i, opt := range masterOpts { + if opt != slaveOpts[i] { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + } + } + return nil +} + +func filterUnsupportedOptions(mount specs.Mount) []string { + rv := make([]string, 0, len(mount.Options)) + for _, o := range mount.Options { + if isSupportedMountFlag(mount.Type, o) { + rv = append(rv, o) + } + } + return rv +} + // podMountHints contains a collection of mountHints for the pod. type podMountHints struct { mounts map[string]*mountHint @@ -500,73 +562,95 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin } } -// setupChildContainer is used to set up the file system for non-root containers -// and amend the procArgs accordingly. This is the main entry point for this -// rest of functions in this file. procArgs are passed by reference and the -// FDMap field is modified. It dups stdioFDs. -func (c *containerMounter) setupChildContainer(conf *Config, procArgs *kernel.CreateProcessArgs) error { - // Setup a child container. - log.Infof("Creating new process in child container.") - - // Create a new root inode and mount namespace for the container. - rootCtx := c.k.SupervisorContext() - rootInode, err := c.createRootMount(rootCtx, conf) - if err != nil { - return fmt.Errorf("creating filesystem for container: %v", err) +// processHints processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHints(conf *Config) error { + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + inode, err := c.mountSharedMaster(ctx, conf, hint) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.root = inode } - mns, err := fs.NewMountNamespace(rootCtx, rootInode) + return nil +} + +// setupFS is used to set up the file system for all containers. This is the +// main entry point method, with most of the other being internal only. It +// returns the mount namespace that is created for the container. +func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { + log.Infof("Configuring container's file system") + + // Create context with root credentials to mount the filesystem (the current + // user may not be privileged enough). + rootProcArgs := *procArgs + rootProcArgs.WorkingDirectory = "/" + rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs.Umask = 0022 + rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals + rootCtx := rootProcArgs.NewContext(c.k) + + mns, err := c.createMountNamespace(rootCtx, conf) if err != nil { - return fmt.Errorf("creating new mount namespace for container: %v", err) + return nil, err } - procArgs.MountNamespace = mns - root := mns.Root() - defer root.DecRef() - // Mount all submounts. - if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil { - return err + // Set namespace here so that it can be found in rootCtx. + rootProcArgs.MountNamespace = mns + + if err := c.mountSubmounts(rootCtx, conf, mns); err != nil { + return nil, err } - return c.checkDispenser() + return mns, nil } -func (c *containerMounter) checkDispenser() error { - if !c.fds.empty() { - return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds) +func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) { + rootInode, err := c.createRootMount(ctx, conf) + if err != nil { + return nil, fmt.Errorf("creating filesystem for container: %v", err) } - return nil + mns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + return nil, fmt.Errorf("creating new mount namespace for container: %v", err) + } + return mns, nil } -// setupRootContainer creates a mount namespace containing the root filesystem -// and all mounts. 'rootCtx' is used to walk directories to find mount points. -// The 'setMountNS' callback is called after the mount namespace is created and -// will get a reference on that namespace. The callback must ensure that the -// rootCtx has the provided mount namespace. -func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error { - for _, hint := range c.hints.mounts { - log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) - inode, err := c.mountSharedMaster(rootCtx, conf, hint) - if err != nil { - return fmt.Errorf("mounting shared master %q: %v", hint.name, err) +func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error { + root := mns.Root() + defer root.DecRef() + + for _, m := range c.mounts { + log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) + if hint := c.hints.findMount(m); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err) + } + } else { + if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil { + return fmt.Errorf("mount submount %q: %v", m.Destination, err) + } } - hint.root = inode } - rootInode, err := c.createRootMount(rootCtx, conf) - if err != nil { - return fmt.Errorf("creating root mount: %v", err) + if err := c.mountTmp(ctx, conf, mns, root); err != nil { + return fmt.Errorf("mount submount %q: %v", "tmp", err) } - mns, err := fs.NewMountNamespace(userCtx, rootInode) - if err != nil { - return fmt.Errorf("creating root mount namespace: %v", err) + + if err := c.checkDispenser(); err != nil { + return err } - setMountNS(mns) + return nil +} - root := mns.Root() - defer root.DecRef() - if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil { - return fmt.Errorf("mounting submounts: %v", err) +func (c *containerMounter) checkDispenser() error { + if !c.fds.empty() { + return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds) } - return c.checkDispenser() + return nil } // mountSharedMaster mounts the master of a volume that is shared among @@ -663,9 +747,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( fsName = sysfs case tmpfs: fsName = m.Type - - // tmpfs has some extra supported options that we must pass through. - opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") + opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...) case bind: fd := c.fds.remove() @@ -684,25 +766,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( return fsName, opts, useOverlay, err } -func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { - for _, m := range c.mounts { - if hint := c.hints.findMount(m); hint != nil && hint.isSupported() { - if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil { - return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err) - } - } else { - if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil { - return fmt.Errorf("mount submount %q: %v", m.Destination, err) - } - } - } - - if err := c.mountTmp(ctx, conf, mns, root); err != nil { - return fmt.Errorf("mount submount %q: %v", "tmp", err) - } - return nil -} - // mountSubmount mounts volumes inside the container's root. Because mounts may // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. @@ -769,17 +832,8 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns // mountSharedSubmount binds mount to a previously mounted volume that is shared // among containers in the same pod. func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error { - // For now enforce that all options are the same. Once bind mount is properly - // supported, then we should ensure the master is less restrictive than the - // container, e.g. master can be 'rw' while container mounts as 'ro'. - if len(mount.Options) != len(source.mount.Options) { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options) - } - sort.Strings(mount.Options) - for i, opt := range mount.Options { - if opt != source.mount.Options[i] { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options) - } + if err := source.checkCompatible(mount); err != nil { + return err } maxTraversals := uint(0) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f91158027..c8e5e86ee 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,7 +20,6 @@ import ( mrand "math/rand" "os" "runtime" - "strings" "sync" "sync/atomic" "syscall" @@ -33,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -56,6 +54,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/runsc/boot/filter" @@ -63,10 +62,10 @@ import ( "gvisor.dev/gvisor/runsc/specutils" // Include supported socket providers. - "gvisor.dev/gvisor/pkg/sentry/socket/epsocket" "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) @@ -527,34 +526,21 @@ func (l *Loader) run() error { // Setup the root container file system. l.startGoferMonitor(l.sandboxID, l.goferFDs) + mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.setupRootContainer(ctx, ctx, l.conf, func(mns *fs.MountNamespace) { - l.rootProcArgs.MountNamespace = mns - }); err != nil { + if err := mntr.processHints(l.conf); err != nil { return err } - - if err := setExecutablePath(ctx, &l.rootProcArgs); err != nil { + if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil { return err } - // Read /etc/passwd for the user's HOME directory and set the HOME - // environment variable as required by POSIX if it is not overridden by - // the user. - hasHomeEnvv := false - for _, envv := range l.rootProcArgs.Envv { - if strings.HasPrefix(envv, "HOME=") { - hasHomeEnvv = true - } - } - if !hasHomeEnvv { - homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID)) - if err != nil { - return fmt.Errorf("error reading exec user: %v", err) - } - - l.rootProcArgs.Envv = append(l.rootProcArgs.Envv, "HOME="+homeDir) + // Add the HOME enviroment variable if it is not already set. + envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + if err != nil { + return err } + l.rootProcArgs.Envv = envv // Create the root container init task. It will begin running // when the kernel is started. @@ -687,13 +673,10 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file // Setup the child container file system. l.startGoferMonitor(cid, goferFDs) - mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints) - if err := mntr.setupChildContainer(conf, &procArgs); err != nil { - return fmt.Errorf("configuring container FS: %v", err) - } - if err := setExecutablePath(ctx, &procArgs); err != nil { - return fmt.Errorf("setting executable path for %+v: %v", procArgs, err) + mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints) + if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil { + return err } // Create and start the new process. @@ -766,26 +749,34 @@ func (l *Loader) destroyContainer(cid string) error { if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { return fmt.Errorf("sending SIGKILL to all container processes: %v", err) } + // Wait for all processes that belong to the container to exit (including + // exec'd processes). + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } + } + + // At this point, all processes inside of the container have exited, + // releasing all references to the container's MountNamespace and + // causing all submounts and overlays to be unmounted. + // + // Since the container's MountNamespace has been released, + // MountNamespace.destroy() will have executed, but that function may + // trigger async close operations. We must wait for those to complete + // before returning, otherwise the caller may kill the gofer before + // they complete, causing a cascade of failing RPCs. + fs.AsyncBarrier() } - // Remove all container thread groups from the map. + // No more failure from this point on. Remove all container thread groups + // from the map. for key := range l.processes { if key.cid == cid { delete(l.processes, key) } } - // At this point, all processes inside of the container have exited, - // releasing all references to the container's MountNamespace and - // causing all submounts and overlays to be unmounted. - // - // Since the container's MountNamespace has been released, - // MountNamespace.destroy() will have executed, but that function may - // trigger async close operations. We must wait for those to complete - // before returning, otherwise the caller may kill the gofer before - // they complete, causing a cascade of failing RPCs. - fs.AsyncBarrier() - log.Debugf("Container destroyed %q", cid) return nil } @@ -813,6 +804,16 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { }) defer args.MountNamespace.DecRef() + // Add the HOME enviroment varible if it is not already set. + root := args.MountNamespace.Root() + defer root.DecRef() + ctx := fs.WithRoot(l.k.SupervisorContext(), root) + envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + // Start the process. proc := control.Proc{Kernel: l.k} args.PIDNamespace = tg.PIDNamespace() @@ -911,15 +912,17 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { case NetworkNone, NetworkSandbox: // NetworkNone sets up loopback using netstack. - netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} - protoNames := []string{tcp.ProtocolName, udp.ProtocolName, icmp.ProtocolName4} - s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{ - Clock: clock, - Stats: epsocket.Metrics, - HandleLocal: true, + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + s := netstack.Stack{stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + Clock: clock, + Stats: netstack.Metrics, + HandleLocal: true, // Enable raw sockets for users with sufficient // privileges. - Raw: true, + UnassociatedFactory: raw.EndpointFactory{}, })} // Enable SACK Recovery. @@ -927,6 +930,10 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { return nil, fmt.Errorf("failed to enable SACK: %v", err) } + // Set default TTLs as required by socket/netstack. + s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + // Enable Receive Buffer Auto-Tuning. if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) @@ -1043,21 +1050,8 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error { // the signal is delivered. This prevents process leaks when SIGKILL is // sent to the entire container. l.k.Pause() - if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil { - l.k.Unpause() - return err - } - l.k.Unpause() - - // If SIGKILLing all processes, wait for them to exit. - if linux.Signal(signo) == linux.SIGKILL { - for _, t := range l.k.TaskSet().Root.Tasks() { - if t.ContainerID() == cid { - t.ThreadGroup().WaitExited() - } - } - } - return nil + defer l.k.Unpause() + return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}) } // threadGroupFromID same as threadGroupFromIDLocked except that it acquires @@ -1090,8 +1084,3 @@ func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host } return ep.tg, ep.tty, true, nil } - -func init() { - // TODO(gvisor.dev/issue/365): Make this configurable. - refs.SetLeakMode(refs.NoLeakChecking) -} diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index e0e32b9d5..147ff7703 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -401,17 +401,16 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - // setupRootContainer needs to find root from the context after the - // namespace is created. - var mns *fs.MountNamespace - setMountNS := func(m *fs.MountNamespace) { - mns = m - ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root()) - } mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{}) - if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil { - t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) + mns, err := mntr.createMountNamespace(ctx, conf) + if err != nil { + t.Fatalf("failed to create mount namespace: %v", err) } + ctx = fs.WithRoot(ctx, mns.Root()) + if err := mntr.mountSubmounts(ctx, conf, mns); err != nil { + t.Fatalf("failed to create mount namespace: %v", err) + } + root := mns.Root() defer root.DecRef() for _, p := range tc.expectedPaths { diff --git a/runsc/boot/network.go b/runsc/boot/network.go index ea0d9f790..32cba5ac1 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -121,10 +121,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct nicID++ nicids[link.Name] = nicID - linkEP := loopback.New() + ep := loopback.New() log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) - if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, true /* loopback */); err != nil { return err } @@ -156,7 +156,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } mac := tcpip.LinkAddress(link.LinkAddress) - linkEP, err := fdbased.New(&fdbased.Options{ + ep, err := fdbased.New(&fdbased.Options{ FDs: FDs, MTU: uint32(link.MTU), EthernetHeader: true, @@ -170,7 +170,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) - if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, false /* loopback */); err != nil { return err } @@ -203,14 +203,14 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct // createNICWithAddrs creates a NIC in the network stack and adds the given // addresses. -func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP, loopback bool) error { +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error { if loopback { - if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(linkEP)); err != nil { - return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) + if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil { + return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v) failed: %v", id, name, err) } } else { - if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil { - return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) + if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil { + return fmt.Errorf("CreateNamedNIC(%v, %v) failed: %v", id, name, err) } } diff --git a/runsc/boot/user.go b/runsc/boot/user.go index d1d423a5c..56cc12ee0 100644 --- a/runsc/boot/user.go +++ b/runsc/boot/user.go @@ -16,6 +16,7 @@ package boot import ( "bufio" + "fmt" "io" "strconv" "strings" @@ -23,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/usermem" ) @@ -42,7 +44,7 @@ func (r *fileReader) Read(buf []byte) (int, error) { // getExecUserHome returns the home directory of the executing user read from // /etc/passwd as read from the container filesystem. -func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32) (string, error) { +func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) { // The default user home directory to return if no user matching the user // if found in the /etc/passwd found in the image. const defaultHome = "/" @@ -82,7 +84,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32 File: f, } - homeDir, err := findHomeInPasswd(uid, r, defaultHome) + homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome) if err != nil { return "", err } @@ -90,6 +92,28 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32 return homeDir, nil } +// maybeAddExecUserHome returns a new slice with the HOME enviroment variable +// set if the slice does not already contain it, otherwise it returns the +// original slice unmodified. +func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) { + // Check if the envv already contains HOME. + for _, env := range envv { + if strings.HasPrefix(env, "HOME=") { + // We have it. Return the original slice unmodified. + return envv, nil + } + } + + // Read /etc/passwd for the user's HOME directory and set the HOME + // environment variable as required by POSIX if it is not overridden by + // the user. + homeDir, err := getExecUserHome(ctx, mns, uid) + if err != nil { + return nil, fmt.Errorf("error reading exec user: %v", err) + } + return append(envv, "HOME="+homeDir), nil +} + // findHomeInPasswd parses a passwd file and returns the given user's home // directory. This function does it's best to replicate the runc's behavior. func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) { diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go index 01f666507..9aee2ad07 100644 --- a/runsc/boot/user_test.go +++ b/runsc/boot/user_test.go @@ -25,6 +25,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) func setupTempDir() (string, error) { @@ -68,7 +69,7 @@ func setupPasswd(contents string, perms os.FileMode) func() (string, error) { // TestGetExecUserHome tests the getExecUserHome function. func TestGetExecUserHome(t *testing.T) { tests := map[string]struct { - uid uint32 + uid auth.KUID createRoot func() (string, error) expected string }{ @@ -164,13 +165,13 @@ func TestGetExecUserHome(t *testing.T) { }, } - var mns *fs.MountNamespace - setMountNS := func(m *fs.MountNamespace) { - mns = m - ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root()) - } mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{}) - if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil { + mns, err := mntr.createMountNamespace(ctx, conf) + if err != nil { + t.Fatalf("failed to create mount namespace: %v", err) + } + ctx = fs.WithRoot(ctx, mns.Root()) + if err := mntr.mountSubmounts(ctx, conf, mns); err != nil { t.Fatalf("failed to create mount namespace: %v", err) } |