Diffstat (limited to 'runsc')
41 files changed, 2306 insertions, 763 deletions
diff --git a/runsc/BUILD b/runsc/BUILD index af8e928c5..8a57c597b 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -1,6 +1,4 @@ -package( - licenses = ["notice"], # Apache 2.0 -) +package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_binary") load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar") @@ -84,8 +82,9 @@ pkg_tar( genrule( name = "deb-version", outs = ["version.txt"], - cmd = "cat bazel-out/volatile-status.txt | grep VERSION | sed 's/^[^0-9]*//' >$@", + cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@", stamp = 1, + tools = [":runsc"], ) pkg_deb( @@ -98,4 +97,7 @@ pkg_deb( package = "runsc", postinst = "debian/postinst.sh", version_file = ":version.txt", + visibility = [ + "//visibility:public", + ], ) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index df9907e52..744f852a1 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -16,6 +16,7 @@ go_library( "limits.go", "loader.go", "network.go", + "pprof.go", "strace.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/boot", @@ -30,6 +31,7 @@ go_library( "//pkg/cpuid", "//pkg/eventchannel", "//pkg/log", + "//pkg/memutil", "//pkg/rand", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", @@ -51,7 +53,6 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/limits", "//pkg/sentry/loader", - "//pkg/sentry/memutil", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm", @@ -94,6 +95,7 @@ go_test( size = "small", srcs = [ "compat_test.go", + "fs_test.go", "loader_test.go", ], embed = [":boot"], diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 15f624f9b..6112b6c0a 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -221,6 +221,17 @@ type Config struct { // user, and without chrooting the sandbox process. This can be // necessary in test environments that have limited capabilities. TestOnlyAllowRunAsCurrentUserWithoutChroot bool + + // NumNetworkChannels controls the number of AF_PACKET sockets that map + // to the same underlying network device. This allows netstack to better + // scale for high throughput use cases. + NumNetworkChannels int + + // Rootless allows the sandbox to be started with a user that is not root. + // Defense is depth measures are weaker with rootless. Specifically, the + // sandbox and Gofer process run as root inside a user namespace with root + // mapped to the caller's user. + Rootless bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -244,6 +255,8 @@ func (c *Config) ToFlags() []string { "--panic-signal=" + strconv.Itoa(c.PanicSignal), "--profile=" + strconv.FormatBool(c.ProfileEnable), "--net-raw=" + strconv.FormatBool(c.EnableRaw), + "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), + "--rootless=" + strconv.FormatBool(c.Rootless), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. 
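The two new Config fields above are handed to the sandbox process as command-line flags by ToFlags. Below is a minimal, self-contained sketch of the flag strings it now emits; the trimmed-down Config type and the field values are illustrative stand-ins, not the full boot.Config.

package main

import (
	"fmt"
	"strconv"
)

// Config is a pared-down stand-in for boot.Config, carrying only the two new fields.
type Config struct {
	NumNetworkChannels int
	Rootless           bool
}

func main() {
	conf := Config{NumNetworkChannels: 4, Rootless: true} // hypothetical values
	// Mirrors the string construction ToFlags now performs for these fields.
	flags := []string{
		"--num-network-channels=" + strconv.Itoa(conf.NumNetworkChannels),
		"--rootless=" + strconv.FormatBool(conf.Rootless),
	}
	fmt.Println(flags) // prints: [--num-network-channels=4 --rootless=true]
}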
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 72ab9ef86..26765cc46 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -237,7 +237,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") } - err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) + err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) return err @@ -340,8 +340,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. - fds := &fdDispenser{fds: cm.l.goferFDs} - renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds) + mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints) + renv, err := mntr.createRestoreEnvironment(cm.l.conf) if err != nil { return fmt.Errorf("creating RestoreEnvironment: %v", err) } @@ -359,6 +359,17 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("file cannot be empty") } + if cm.l.conf.ProfileEnable { + // initializePProf opens /proc/self/maps, so has to be + // called before installing seccomp filters. + initializePProf() + } + + // Seccomp filters have to be applied before parsing the state file. + if err := cm.l.installSeccompFilters(); err != nil { + return err + } + // Load the state. loadOpts := state.LoadOpts{Source: specFile} if err := loadOpts.Load(k, networkStack); err != nil { @@ -369,11 +380,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { k.Timekeeper().SetClocks(time.NewCalibratedClocks()) // Since we have a new kernel we also must make a new watchdog. - watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction) + dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction) // Change the loader fields to reflect the changes made when restoring. cm.l.k = k - cm.l.watchdog = watchdog + cm.l.watchdog = dog cm.l.rootProcArgs = kernel.CreateProcessArgs{} cm.l.restore = true @@ -420,16 +431,12 @@ type WaitPIDArgs struct { // CID is the container ID. CID string - - // ClearStatus determines whether the exit status of the process should - // be cleared when WaitPID returns. - ClearStatus bool } // WaitPID waits for the process with PID 'pid' in the sandbox. func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { log.Debugf("containerManager.Wait") - return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus) + return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) } // SignalDeliveryMode enumerates different signal delivery modes. diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 4e428b49c..0811e10f4 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -28,11 +28,12 @@ import ( // createFDMap creates an FD map that contains stdin, stdout, and stderr. If // console is true, then ioctl calls will be passed through to the host FD. // Upon success, createFDMap dups then closes stdioFDs. 
-func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { +func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { if len(stdioFDs) != 3 { return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } + k := kernel.KernelFromContext(ctx) fdm := k.NewFDMap() defer fdm.DecRef() mounter := fs.FileOwnerFromContext(ctx) diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 652da1cef..ef2dbfad2 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -246,6 +246,10 @@ var allowedSyscalls = seccomp.SyscallRules{ }, syscall.SYS_SETITIMER: {}, syscall.SYS_SHUTDOWN: []seccomp.Rule{ + // Used by fs/host to shutdown host sockets. + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)}, + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)}, + // Used by unet to shutdown connections. {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 4b1557b9a..2fa0725d1 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -18,6 +18,7 @@ import ( "fmt" "path" "path/filepath" + "sort" "strconv" "strings" "syscall" @@ -29,9 +30,6 @@ import ( _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/sentry/limits" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -40,6 +38,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -51,6 +51,9 @@ const ( // Device name for root mount. rootDevice = "9pfs-/" + // MountPrefix is the annotation prefix for mount hints. + MountPrefix = "gvisor.dev/spec/mount" + // ChildContainersDir is the directory where child container root // filesystems are mounted. ChildContainersDir = "/__runsc_containers__" @@ -65,67 +68,24 @@ const ( nonefs = "none" ) -type fdDispenser struct { - fds []int -} - -func (f *fdDispenser) remove() int { - if f.empty() { - panic("fdDispenser out of fds") - } - rv := f.fds[0] - f.fds = f.fds[1:] - return rv -} - -func (f *fdDispenser) empty() bool { - return len(f.fds) == 0 -} +func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { + // Upper layer uses the same flags as lower, but it must be read-write. 
+ upperFlags := lowerFlags + upperFlags.ReadOnly = false -func adjustDirentCache(k *kernel.Kernel) error { - var hl syscall.Rlimit - if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil { - return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) - } - if int64(hl.Cur) != syscall.RLIM_INFINITY { - newSize := hl.Cur / 2 - if newSize < gofer.DefaultDirentCacheSize { - log.Infof("Setting gofer dirent cache size to %d", newSize) - gofer.DefaultDirentCacheSize = newSize - k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) - } + tmpFS := mustFindFilesystem("tmpfs") + if !fs.IsDir(lower.StableAttr) { + // Create overlay on top of mount file, e.g. /etc/hostname. + msrc := fs.NewCachingMountSource(tmpFS, upperFlags) + return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags) } - return nil -} -// setupRootContainerFS creates a mount namespace containing the root filesystem -// and all mounts. 'rootCtx' is used to walk directories to find mount points. -// 'setMountNS' is called after namespace is created. It must set the mount NS -// to 'rootCtx'. -func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error { - mounts := compileMounts(spec) - - // Create a tmpfs mount where we create and mount a root filesystem for - // each child container. - mounts = append(mounts, specs.Mount{ - Type: tmpfs, - Destination: ChildContainersDir, - }) - - fds := &fdDispenser{fds: goferFDs} - rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) - if err != nil { - return fmt.Errorf("creating root mount: %v", err) - } - mns, err := fs.NewMountNamespace(userCtx, rootInode) + // Create overlay on top of mount dir. + upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil) if err != nil { - return fmt.Errorf("creating root mount namespace: %v", err) + return nil, fmt.Errorf("creating tmpfs overlay: %v", err) } - setMountNS(mns) - - root := mns.Root() - defer root.DecRef() - return mountSubmounts(rootCtx, conf, mns, root, mounts, fds) + return fs.NewOverlayRoot(ctx, upper, lower, upperFlags) } // compileMounts returns the supported mounts from the mount spec, adding any @@ -184,186 +144,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount { return mounts } -// createRootMount creates the root filesystem. -func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) { - // First construct the filesystem from the spec.Root. - mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay} - - var ( - rootInode *fs.Inode - err error - ) - - fd := fds.remove() - log.Infof("Mounting root over 9P, ioFD: %d", fd) - p9FS := mustFindFilesystem("9p") - opts := p9MountOptions(fd, conf.FileAccess) - rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) - if err != nil { - return nil, fmt.Errorf("creating root mount point: %v", err) - } - - // We need to overlay the root on top of a ramfs with stub directories - // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always - // mounted even if they are not in the spec. 
- submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp") - rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) - if err != nil { - return nil, fmt.Errorf("adding submount overlay: %v", err) - } - - if conf.Overlay && !spec.Root.Readonly { - log.Debugf("Adding overlay on top of root mount") - // Overlay a tmpfs filesystem on top of the root. - rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) - if err != nil { - return nil, err - } - } - - log.Infof("Mounted %q to %q type root", spec.Root.Path, "/") - return rootInode, nil -} - -func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { - // Upper layer uses the same flags as lower, but it must be read-write. - lowerFlags.ReadOnly = false - - tmpFS := mustFindFilesystem("tmpfs") - if !fs.IsDir(lower.StableAttr) { - // Create overlay on top of mount file, e.g. /etc/hostname. - msrc := fs.NewCachingMountSource(tmpFS, lowerFlags) - return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags) - } - - // Create overlay on top of mount dir. - upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil) - if err != nil { - return nil, fmt.Errorf("creating tmpfs overlay: %v", err) - } - return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) -} - -// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values -// used for mounts. -func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) { - var ( - fsName string - opts []string - useOverlay bool - err error - ) - - switch m.Type { - case devpts, devtmpfs, proc, sysfs: - fsName = m.Type - case nonefs: - fsName = sysfs - case tmpfs: - fsName = m.Type - - // tmpfs has some extra supported options that we must pass through. - opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") - - case bind: - fd := fds.remove() - fsName = "9p" - // Non-root bind mounts are always shared. - opts = p9MountOptions(fd, FileAccessShared) - // If configured, add overlay to all writable mounts. - useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly - - default: - // TODO(nlacasse): Support all the mount types and make this a - // fatal error. Most applications will "just work" without - // them, so this is a warning for now. - // we do not support. - log.Warningf("ignoring unknown filesystem type %q", m.Type) - } - return fsName, opts, useOverlay, err -} - -func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error { - for _, m := range mounts { - if err := mountSubmount(ctx, conf, mns, root, fds, m, mounts); err != nil { - return fmt.Errorf("mount submount %q: %v", m.Destination, err) - } - } - - if err := mountTmp(ctx, conf, mns, root, mounts); err != nil { - return fmt.Errorf("mount submount %q: %v", "tmp", err) - } - - if !fds.empty() { - return fmt.Errorf("not all mount points were consumed, remaining: %v", fds) - } - return nil -} - -// mountSubmount mounts volumes inside the container's root. Because mounts may -// be readonly, a lower ramfs overlay is added to create the mount point dir. -// Another overlay is added with tmpfs on top if Config.Overlay is true. -// 'm.Destination' must be an absolute path with '..' and symlinks resolved. 
-func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { - // Map mount type to filesystem name, and parse out the options that we are - // capable of dealing with. - fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) - - // Return the error or nil that corresponds to the default case in getMountNameAndOptions. - if err != nil { - return err - } - if fsName == "" { - return nil - } - - // All filesystem names should have been mapped to something we know. - filesystem := mustFindFilesystem(fsName) - - mf := mountFlags(m.Options) - if useOverlay { - // All writes go to upper, be paranoid and make lower readonly. - mf.ReadOnly = true - } - - inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) - if err != nil { - return fmt.Errorf("creating mount with source %q: %v", m.Source, err) - } - - // If there are submounts, we need to overlay the mount on top of a - // ramfs with stub directories for submount paths. - submounts := subtargets(m.Destination, mounts) - if len(submounts) > 0 { - log.Infof("Adding submount overlay over %q", m.Destination) - inode, err = addSubmountOverlay(ctx, inode, submounts) - if err != nil { - return fmt.Errorf("adding submount overlay: %v", err) - } - } - - if useOverlay { - log.Debugf("Adding overlay on top of mount %q", m.Destination) - inode, err = addOverlay(ctx, conf, inode, m.Type, mf) - if err != nil { - return err - } - } - - maxTraversals := uint(0) - dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) - if err != nil { - return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) - } - defer dirent.DecRef() - if err := mns.Mount(ctx, dirent, inode); err != nil { - return fmt.Errorf("mount %q error: %v", m.Destination, err) - } - - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) - return nil -} - // p9MountOptions creates a slice of options for a p9 mount. func p9MountOptions(fd int, fa FileAccessType) []string { opts := []string{ @@ -416,82 +196,6 @@ func mountDevice(m specs.Mount) string { return "none" } -// addRestoreMount adds a mount to the MountSources map used for restoring a -// checkpointed container. -func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error { - fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) - - // Return the error or nil that corresponds to the default case in getMountNameAndOptions. - if err != nil { - return err - } - // TODO(nlacasse): Fix this when we support all the mount types and - // make this a fatal error. - if fsName == "" { - return nil - } - - newMount := fs.MountArgs{ - Dev: mountDevice(m), - Flags: mountFlags(m.Options), - DataString: strings.Join(opts, ","), - } - if useOverlay { - newMount.Flags.ReadOnly = true - } - renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) - log.Infof("Added mount at %q: %+v", fsName, newMount) - return nil -} - -// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts -// to the environment. -func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) { - renv := &fs.RestoreEnvironment{ - MountSources: make(map[string][]fs.MountArgs), - } - - // Add root mount. 
- fd := fds.remove() - opts := p9MountOptions(fd, conf.FileAccess) - - mf := fs.MountSourceFlags{} - if spec.Root.Readonly || conf.Overlay { - mf.ReadOnly = true - } - - rootMount := fs.MountArgs{ - Dev: rootDevice, - Flags: mf, - DataString: strings.Join(opts, ","), - } - renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) - - // Add submounts. - var tmpMounted bool - for _, m := range compileMounts(spec) { - if err := addRestoreMount(conf, renv, m, fds); err != nil { - return nil, err - } - if filepath.Clean(m.Destination) == "/tmp" { - tmpMounted = true - } - } - - // TODO(b/67958150): handle '/tmp' properly (see mountTmp()). - if !tmpMounted { - tmpMount := specs.Mount{ - Type: tmpfs, - Destination: "/tmp", - } - if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil { - return nil, err - } - } - - return renv, nil -} - func mountFlags(opts []string) fs.MountSourceFlags { mf := fs.MountSourceFlags{} for _, o := range opts { @@ -546,22 +250,254 @@ func subtargets(root string, mnts []specs.Mount) []string { return targets } -// setupContainerFS is used to set up the file system and amend the procArgs accordingly. -// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs. -func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { - ctx := procArgs.NewContext(k) - - // Create the FD map, which will set stdin, stdout, and stderr. If console - // is true, then ioctl calls will be passed through to the host fd. - fdm, err := createFDMap(ctx, k, ls, console, stdioFDs) +// setExecutablePath sets the procArgs.Filename by searching the PATH for an +// executable matching the procArgs.Argv[0]. +func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { + paths := fs.GetPath(procArgs.Envv) + exe := procArgs.Argv[0] + f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths) if err != nil { - return fmt.Errorf("importing fds: %v", err) + return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err) + } + procArgs.Filename = f + return nil +} + +func adjustDirentCache(k *kernel.Kernel) error { + var hl syscall.Rlimit + if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil { + return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) } + if int64(hl.Cur) != syscall.RLIM_INFINITY { + newSize := hl.Cur / 2 + if newSize < gofer.DefaultDirentCacheSize { + log.Infof("Setting gofer dirent cache size to %d", newSize) + gofer.DefaultDirentCacheSize = newSize + k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) + } + } + return nil +} - // CreateProcess takes a reference on FDMap if successful. We - // won't need ours either way. - procArgs.FDMap = fdm +type fdDispenser struct { + fds []int +} +func (f *fdDispenser) remove() int { + if f.empty() { + panic("fdDispenser out of fds") + } + rv := f.fds[0] + f.fds = f.fds[1:] + return rv +} + +func (f *fdDispenser) empty() bool { + return len(f.fds) == 0 +} + +type shareType int + +const ( + invalid shareType = iota + + // container shareType indicates that the mount is used by a single container. + container + + // pod shareType indicates that the mount is used by more than one container + // inside the pod. 
+ pod + + // shared shareType indicates that the mount can also be shared with a process + // outside the pod, e.g. NFS. + shared +) + +func parseShare(val string) (shareType, error) { + switch val { + case "container": + return container, nil + case "pod": + return pod, nil + case "shared": + return shared, nil + default: + return 0, fmt.Errorf("invalid share value %q", val) + } +} + +func (s shareType) String() string { + switch s { + case invalid: + return "invalid" + case container: + return "container" + case pod: + return "pod" + case shared: + return "shared" + default: + return fmt.Sprintf("invalid share value %d", s) + } +} + +// mountHint represents extra information about mounts that are provided via +// annotations. They can override mount type, and provide sharing information +// so that mounts can be correctly shared inside the pod. +type mountHint struct { + name string + share shareType + mount specs.Mount + + // root is the inode where the volume is mounted. For mounts with 'pod' share + // the volume is mounted once and then bind mounted inside the containers. + root *fs.Inode +} + +func (m *mountHint) setField(key, val string) error { + switch key { + case "source": + if len(val) == 0 { + return fmt.Errorf("source cannot be empty") + } + m.mount.Source = val + case "type": + return m.setType(val) + case "share": + share, err := parseShare(val) + if err != nil { + return err + } + m.share = share + case "options": + return m.setOptions(val) + default: + return fmt.Errorf("invalid mount annotation: %s=%s", key, val) + } + return nil +} + +func (m *mountHint) setType(val string) error { + switch val { + case "tmpfs", "bind": + m.mount.Type = val + default: + return fmt.Errorf("invalid type %q", val) + } + return nil +} + +func (m *mountHint) setOptions(val string) error { + opts := strings.Split(val, ",") + if err := specutils.ValidateMountOptions(opts); err != nil { + return err + } + // Sort options so it can be compared with container mount options later on. + sort.Strings(opts) + m.mount.Options = opts + return nil +} + +func (m *mountHint) isSupported() bool { + return m.mount.Type == tmpfs && m.share == pod +} + +// podMountHints contains a collection of mountHints for the pod. +type podMountHints struct { + mounts map[string]*mountHint +} + +func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { + mnts := make(map[string]*mountHint) + for k, v := range spec.Annotations { + // Look for 'gvisor.dev/spec/mount' annotations and parse them. + if strings.HasPrefix(k, MountPrefix) { + parts := strings.Split(k, "/") + if len(parts) != 5 { + return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v) + } + name := parts[3] + if len(name) == 0 || path.Clean(name) != name { + return nil, fmt.Errorf("invalid mount name: %s", name) + } + mnt := mnts[name] + if mnt == nil { + mnt = &mountHint{name: name} + mnts[name] = mnt + } + if err := mnt.setField(parts[4], v); err != nil { + return nil, err + } + } + } + + // Validate all hints after done parsing. + for name, m := range mnts { + log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share) + if m.share == invalid { + return nil, fmt.Errorf("share field for %q has not been set", m.name) + } + if len(m.mount.Source) == 0 { + return nil, fmt.Errorf("source field for %q has not been set", m.name) + } + if len(m.mount.Type) == 0 { + return nil, fmt.Errorf("type field for %q has not been set", m.name) + } + + // Check for duplicate mount sources. 
+ for name2, m2 := range mnts { + if name != name2 && m.mount.Source == m2.mount.Source { + return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source) + } + } + } + + return &podMountHints{mounts: mnts}, nil +} + +func (p *podMountHints) findMount(mount specs.Mount) *mountHint { + for _, m := range p.mounts { + if m.mount.Source == mount.Source { + return m + } + } + return nil +} + +type containerMounter struct { + // cid is the container ID. May be set to empty for the root container. + cid string + + root *specs.Root + + // mounts is the set of submounts for the container. It's a copy from the spec + // that may be freely modified without affecting the original spec. + mounts []specs.Mount + + // fds is the list of FDs to be dispensed for mounts that require it. + fds fdDispenser + + k *kernel.Kernel + + hints *podMountHints +} + +func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter { + return &containerMounter{ + cid: cid, + root: spec.Root, + mounts: compileMounts(spec), + fds: fdDispenser{fds: goferFDs}, + k: k, + hints: hints, + } +} + +// setupFS is used to set up the file system for containers and amend +// the procArgs accordingly. This is the main entry point for this rest of +// functions in this file. procArgs are passed by reference and the FDMap field +// is modified. It dups stdioFDs. +func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error { // Use root user to configure mounts. The current user might not have // permission to do so. rootProcArgs := kernel.CreateProcessArgs{ @@ -570,16 +506,19 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf Umask: 0022, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, } - rootCtx := rootProcArgs.NewContext(k) + rootCtx := rootProcArgs.NewContext(c.k) // If this is the root container, we also need to setup the root mount // namespace. - mns := k.RootMountNamespace() + mns := c.k.RootMountNamespace() if mns == nil { // Setup the root container. - return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) { - k.SetRootMountNamespace(mns) - }) + if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) { + c.k.SetRootMountNamespace(mns) + }); err != nil { + return err + } + return c.checkDispenser() } // Setup a child container. @@ -593,18 +532,17 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf if err != nil { return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err) } - if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("create directory %q: %v", cid, err) + if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("create directory %q: %v", c.cid, err) } - containerRoot, err := contDir.Walk(ctx, globalRoot, cid) + containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid) if err != nil { - return fmt.Errorf("walk to %q failed: %v", cid, err) + return fmt.Errorf("walk to %q failed: %v", c.cid, err) } defer containerRoot.DecRef() // Create the container's root filesystem mount. 
- fds := &fdDispenser{fds: goferFDs} - rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) + rootInode, err := c.createRootMount(rootCtx, conf) if err != nil { return fmt.Errorf("creating filesystem for container: %v", err) } @@ -614,39 +552,32 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf return fmt.Errorf("mount container root: %v", err) } - // We have to re-walk to the dirent to find the mounted - // directory. The old dirent is invalid at this point. - containerRoot, err = contDir.Walk(ctx, globalRoot, cid) + // We have to re-walk to the dirent to find the mounted directory. The old + // dirent is invalid at this point. + containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid) if err != nil { - return fmt.Errorf("find container mount point %q: %v", cid, err) + return fmt.Errorf("find container mount point %q: %v", c.cid, err) } cu := specutils.MakeCleanup(func() { containerRoot.DecRef() }) defer cu.Clean() - log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid)) + log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid)) // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it. procArgs.Root = containerRoot // Mount all submounts. - mounts := compileMounts(spec) - if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil { + if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil { return err } cu.Release() - return nil + return c.checkDispenser() } -// setExecutablePath sets the procArgs.Filename by searching the PATH for an -// executable matching the procArgs.Argv[0]. -func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { - paths := fs.GetPath(procArgs.Envv) - exe := procArgs.Argv[0] - f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths) - if err != nil { - return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err) +func (c *containerMounter) checkDispenser() error { + if !c.fds.empty() { + return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds) } - procArgs.Filename = f return nil } @@ -715,17 +646,354 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error return nil } +// setupRootContainer creates a mount namespace containing the root filesystem +// and all mounts. 'rootCtx' is used to walk directories to find mount points. +// 'setMountNS' is called after namespace is created. It must set the mount NS +// to 'rootCtx'. +func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error { + for _, hint := range c.hints.mounts { + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + inode, err := c.mountSharedMaster(rootCtx, conf, hint) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.root = inode + } + + // Create a tmpfs mount where we create and mount a root filesystem for + // each child container. 
+ c.mounts = append(c.mounts, specs.Mount{ + Type: tmpfs, + Destination: ChildContainersDir, + }) + + rootInode, err := c.createRootMount(rootCtx, conf) + if err != nil { + return fmt.Errorf("creating root mount: %v", err) + } + mns, err := fs.NewMountNamespace(userCtx, rootInode) + if err != nil { + return fmt.Errorf("creating root mount namespace: %v", err) + } + setMountNS(mns) + + root := mns.Root() + defer root.DecRef() + return c.mountSubmounts(rootCtx, conf, mns, root) +} + +// mountSharedMaster mounts the master of a volume that is shared among +// containers in a pod. It returns the root mount's inode. +func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + + // Mount with revalidate because it's shared among containers. + opts = append(opts, "cache=revalidate") + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(hint.mount.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + + inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil) + if err != nil { + return nil, fmt.Errorf("creating mount %q: %v", hint.name, err) + } + + if useOverlay { + log.Debugf("Adding overlay on top of shared mount %q", hint.name) + inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf) + if err != nil { + return nil, err + } + } + + return inode, nil +} + +// createRootMount creates the root filesystem. +func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) { + // First construct the filesystem from the spec.Root. + mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay} + + fd := c.fds.remove() + log.Infof("Mounting root over 9P, ioFD: %d", fd) + p9FS := mustFindFilesystem("9p") + opts := p9MountOptions(fd, conf.FileAccess) + rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) + if err != nil { + return nil, fmt.Errorf("creating root mount point: %v", err) + } + + // We need to overlay the root on top of a ramfs with stub directories + // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always + // mounted even if they are not in the spec. + submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + if err != nil { + return nil, fmt.Errorf("adding submount overlay: %v", err) + } + + if conf.Overlay && !c.root.Readonly { + log.Debugf("Adding overlay on top of root mount") + // Overlay a tmpfs filesystem on top of the root. + rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) + if err != nil { + return nil, err + } + } + + log.Infof("Mounted %q to %q type root", c.root.Path, "/") + return rootInode, nil +} + +// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values +// used for mounts. 
+func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) { + var ( + fsName string + opts []string + useOverlay bool + err error + ) + + switch m.Type { + case devpts, devtmpfs, proc, sysfs: + fsName = m.Type + case nonefs: + fsName = sysfs + case tmpfs: + fsName = m.Type + + // tmpfs has some extra supported options that we must pass through. + opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") + + case bind: + fd := c.fds.remove() + fsName = "9p" + // Non-root bind mounts are always shared. + opts = p9MountOptions(fd, FileAccessShared) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + + default: + // TODO(nlacasse): Support all the mount types and make this a fatal error. + // Most applications will "just work" without them, so this is a warning + // for now. + log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, opts, useOverlay, err +} + +func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { + for _, m := range c.mounts { + if hint := c.hints.findMount(m); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err) + } + } else { + if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil { + return fmt.Errorf("mount submount %q: %v", m.Destination, err) + } + } + } + + if err := c.mountTmp(ctx, conf, mns, root); err != nil { + return fmt.Errorf("mount submount %q: %v", "tmp", err) + } + return nil +} + +// mountSubmount mounts volumes inside the container's root. Because mounts may +// be readonly, a lower ramfs overlay is added to create the mount point dir. +// Another overlay is added with tmpfs on top if Config.Overlay is true. +// 'm.Destination' must be an absolute path with '..' and symlinks resolved. +func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) + if err != nil { + return err + } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(m.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) + if err != nil { + return fmt.Errorf("creating mount with source %q: %v", m.Source, err) + } + + // If there are submounts, we need to overlay the mount on top of a ramfs + // with stub directories for submount paths. 
+ submounts := subtargets(m.Destination, c.mounts) + if len(submounts) > 0 { + log.Infof("Adding submount overlay over %q", m.Destination) + inode, err = addSubmountOverlay(ctx, inode, submounts) + if err != nil { + return fmt.Errorf("adding submount overlay: %v", err) + } + } + + if useOverlay { + log.Debugf("Adding overlay on top of mount %q", m.Destination) + inode, err = addOverlay(ctx, conf, inode, m.Type, mf) + if err != nil { + return err + } + } + + maxTraversals := uint(0) + dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) + if err != nil { + return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) + } + defer dirent.DecRef() + if err := mns.Mount(ctx, dirent, inode); err != nil { + return fmt.Errorf("mount %q error: %v", m.Destination, err) + } + + log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + return nil +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error { + // For now enforce that all options are the same. Once bind mount is properly + // supported, then we should ensure the master is less restrictive than the + // container, e.g. master can be 'rw' while container mounts as 'ro'. + if len(mount.Options) != len(source.mount.Options) { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options) + } + sort.Strings(mount.Options) + for i, opt := range mount.Options { + if opt != source.mount.Options[i] { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options) + } + } + + maxTraversals := uint(0) + target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals) + if err != nil { + return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) + } + defer target.DecRef() + + if err := mns.Mount(ctx, target, source.root); err != nil { + return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) + } + + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return nil +} + +// addRestoreMount adds a mount to the MountSources map used for restoring a +// checkpointed container. +func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error { + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) + if err != nil { + return err + } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + newMount := fs.MountArgs{ + Dev: mountDevice(m), + Flags: mountFlags(m.Options), + DataString: strings.Join(opts, ","), + } + if useOverlay { + newMount.Flags.ReadOnly = true + } + renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) + log.Infof("Added mount at %q: %+v", fsName, newMount) + return nil +} + +// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding +// the mounts to the environment. +func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) { + renv := &fs.RestoreEnvironment{ + MountSources: make(map[string][]fs.MountArgs), + } + + // Add root mount. 
+ fd := c.fds.remove() + opts := p9MountOptions(fd, conf.FileAccess) + + mf := fs.MountSourceFlags{} + if c.root.Readonly || conf.Overlay { + mf.ReadOnly = true + } + + rootMount := fs.MountArgs{ + Dev: rootDevice, + Flags: mf, + DataString: strings.Join(opts, ","), + } + renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) + + // Add submounts. + var tmpMounted bool + for _, m := range c.mounts { + if err := c.addRestoreMount(conf, renv, m); err != nil { + return nil, err + } + if filepath.Clean(m.Destination) == "/tmp" { + tmpMounted = true + } + } + + // TODO(b/67958150): handle '/tmp' properly (see mountTmp()). + if !tmpMounted { + tmpMount := specs.Mount{ + Type: tmpfs, + Destination: "/tmp", + } + if err := c.addRestoreMount(conf, renv, tmpMount); err != nil { + return nil, err + } + } + + return renv, nil +} + // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so. // Technically we don't have to mount tmpfs at /tmp, as we could just rely on // the host /tmp, but this is a nice optimization, and fixes some apps that call // mknod in /tmp. It's unsafe to mount tmpfs if: -// 1. /tmp is mounted explictly: we should not override user's wish +// 1. /tmp is mounted explicitly: we should not override user's wish // 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. -func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error { - for _, m := range mounts { +func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { + for _, m := range c.mounts { if filepath.Clean(m.Destination) == "/tmp" { log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) return nil @@ -766,7 +1034,7 @@ func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *f // another user. This is normally done for /tmp. Options: []string{"mode=1777"}, } - return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts) + return c.mountSubmount(ctx, conf, mns, root, tmpMount) default: return err diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go new file mode 100644 index 000000000..49ab34b33 --- /dev/null +++ b/runsc/boot/fs_test.go @@ -0,0 +1,193 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package boot + +import ( + "path" + "reflect" + "strings" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestPodMountHintsHappy(t *testing.T) { + spec := &specs.Spec{ + Annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "pod", + + path.Join(MountPrefix, "mount2", "source"): "bar", + path.Join(MountPrefix, "mount2", "type"): "bind", + path.Join(MountPrefix, "mount2", "share"): "container", + path.Join(MountPrefix, "mount2", "options"): "rw,private", + }, + } + podHints, err := newPodMountHints(spec) + if err != nil { + t.Errorf("newPodMountHints failed: %v", err) + } + + // Check that fields were set correctly. + mount1 := podHints.mounts["mount1"] + if want := "mount1"; want != mount1.name { + t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name) + } + if want := "foo"; want != mount1.mount.Source { + t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source) + } + if want := "tmpfs"; want != mount1.mount.Type { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type) + } + if want := pod; want != mount1.share { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.share) + } + if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Options) + } + + mount2 := podHints.mounts["mount2"] + if want := "mount2"; want != mount2.name { + t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name) + } + if want := "bar"; want != mount2.mount.Source { + t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source) + } + if want := "bind"; want != mount2.mount.Type { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type) + } + if want := container; want != mount2.share { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.share) + } + if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Options) + } +} + +func TestPodMountHintsErrors(t *testing.T) { + for _, tst := range []struct { + name string + annotations map[string]string + error string + }{ + { + name: "too short", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1"): "foo", + }, + error: "invalid mount annotation", + }, + { + name: "no name", + annotations: map[string]string{ + MountPrefix + "//source": "foo", + }, + error: "invalid mount name", + }, + { + name: "missing source", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "pod", + }, + error: "source field", + }, + { + name: "missing type", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "share"): "pod", + }, + error: "type field", + }, + { + name: "missing share", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + }, + error: "share field", + }, + { + name: "invalid field name", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "invalid"): "foo", + }, + error: "invalid mount annotation", + }, + { + name: "invalid source", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", 
"share"): "pod", + }, + error: "source cannot be empty", + }, + { + name: "invalid type", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "invalid-type", + path.Join(MountPrefix, "mount1", "share"): "pod", + }, + error: "invalid type", + }, + { + name: "invalid share", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "invalid-share", + }, + error: "invalid share", + }, + { + name: "invalid options", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "pod", + path.Join(MountPrefix, "mount1", "options"): "invalid-option", + }, + error: "unknown mount option", + }, + { + name: "duplicate source", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "pod", + + path.Join(MountPrefix, "mount2", "source"): "foo", + path.Join(MountPrefix, "mount2", "type"): "bind", + path.Join(MountPrefix, "mount2", "share"): "container", + }, + error: "have the same mount source", + }, + } { + t.Run(tst.name, func(t *testing.T) { + spec := &specs.Spec{Annotations: tst.annotations} + podHints, err := newPodMountHints(spec) + if err == nil || !strings.Contains(err.Error(), tst.error) { + t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err) + } + if podHints != nil { + t.Errorf("newPodMountHints must return nil on failure: %+v", podHints) + } + }) + } +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 6ac6b94dd..c1dea736f 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/memutil" "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" @@ -37,7 +38,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/loader" - "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" @@ -117,6 +117,10 @@ type Loader struct { // // processes is guardded by mu. processes map[execID]*execProcess + + // mountHints provides extra information about mounts for containers that + // apply to the entire pod. + mountHints *podMountHints } // execID uniquely identifies a sentry process that is executed in a container. @@ -288,7 +292,7 @@ func New(args Args) (*Loader, error) { } // Create a watchdog. 
- watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) + dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) procArgs, err := newProcess(args.ID, args.Spec, creds, k) if err != nil { @@ -299,18 +303,24 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing compat logs: %v", err) } + mountHints, err := newPodMountHints(args.Spec) + if err != nil { + return nil, fmt.Errorf("creating pod mount hints: %v", err) + } + eid := execID{cid: args.ID} l := &Loader{ k: k, conf: args.Conf, console: args.Console, - watchdog: watchdog, + watchdog: dog, spec: args.Spec, goferFDs: args.GoferFDs, stdioFDs: args.StdioFDs, rootProcArgs: procArgs, sandboxID: args.ID, processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, } // We don't care about child signals; some platforms can generate a @@ -424,6 +434,9 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return nil, fmt.Errorf("error creating memfd: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) + // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if + // there are memory cgroups specified, because at this point we're already + // in a mount namespace in which the relevant cgroupfs is not visible. mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() @@ -432,7 +445,24 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return mf, nil } -// Run runs the root container.. +func (l *Loader) installSeccompFilters() error { + if l.conf.DisableSeccomp { + filter.Report("syscall filter is DISABLED. Running in less secure mode.") + } else { + opts := filter.Options{ + Platform: l.k.Platform, + HostNetwork: l.conf.Network == NetworkHost, + ProfileEnable: l.conf.ProfileEnable, + ControllerFD: l.ctrl.srv.FD(), + } + if err := filter.Install(opts); err != nil { + return fmt.Errorf("installing seccomp filters: %v", err) + } + } + return nil +} + +// Run runs the root container. func (l *Loader) Run() error { err := l.run() l.ctrl.manager.startResultChan <- err @@ -467,36 +497,34 @@ func (l *Loader) run() error { return fmt.Errorf("trying to start deleted container %q", l.sandboxID) } - // Finally done with all configuration. Setup filters before user code - // is loaded. - if l.conf.DisableSeccomp { - filter.Report("syscall filter is DISABLED. Running in less secure mode.") - } else { - opts := filter.Options{ - Platform: l.k.Platform, - HostNetwork: l.conf.Network == NetworkHost, - ProfileEnable: l.conf.ProfileEnable, - ControllerFD: l.ctrl.srv.FD(), - } - if err := filter.Install(opts); err != nil { - return fmt.Errorf("installing seccomp filters: %v", err) - } - } - // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. if !l.restore { - if err := setupContainerFS( - &l.rootProcArgs, - l.spec, - l.conf, - l.stdioFDs, - l.goferFDs, - l.console, - l.rootProcArgs.Credentials, - l.rootProcArgs.Limits, - l.k, - "" /* CID, which isn't needed for the root container */); err != nil { + if l.conf.ProfileEnable { + initializePProf() + } + + // Finally done with all configuration. Setup filters before user code + // is loaded. + if err := l.installSeccompFilters(); err != nil { + return err + } + + // Create the FD map, which will set stdin, stdout, and stderr. If console + // is true, then ioctl calls will be passed through to the host fd. 
+ ctx := l.rootProcArgs.NewContext(l.k) + fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + // CreateProcess takes a reference on FDMap if successful. We won't need + // ours either way. + l.rootProcArgs.FDMap = fdm + + // cid for root container can be empty. Only subcontainers need it to set + // the mount location. + mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints) + if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil { return err } @@ -552,7 +580,7 @@ func (l *Loader) createContainer(cid string) error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. Caller owns 'files' and may close them after // this method returns. -func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error { +func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -596,6 +624,16 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config stdioFDs = append(stdioFDs, int(f.Fd())) } + // Create the FD map, which will set stdin, stdout, and stderr. + ctx := procArgs.NewContext(l.k) + fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + // CreateProcess takes a reference on FDMap if successful. We won't need ours + // either way. + procArgs.FDMap = fdm + // Can't take ownership away from os.File. dup them to get a new FDs. var goferFDs []int for _, f := range files[3:] { @@ -606,22 +644,12 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config goferFDs = append(goferFDs, fd) } - if err := setupContainerFS( - &procArgs, - spec, - conf, - stdioFDs, - goferFDs, - false, - creds, - procArgs.Limits, - k, - cid); err != nil { + mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints) + if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil { return fmt.Errorf("configuring container FS: %v", err) } - ctx := procArgs.NewContext(l.k) - mns := k.RootMountNamespace() + mns := l.k.RootMountNamespace() if err := setExecutablePath(ctx, mns, &procArgs); err != nil { return fmt.Errorf("setting executable path for %+v: %v", procArgs, err) } @@ -724,7 +752,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { return nil } -func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error { +func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { if tgid <= 0 { return fmt.Errorf("PID (%d) must be positive", tgid) } @@ -736,13 +764,10 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai ws := l.wait(execTG) *waitStatus = ws - // Remove tg from the cache if caller requested it. 
- if clearStatus { - l.mu.Lock() - delete(l.processes, eid) - log.Debugf("updated processes (removal): %v", l.processes) - l.mu.Unlock() - } + l.mu.Lock() + delete(l.processes, eid) + log.Debugf("updated processes (removal): %v", l.processes) + l.mu.Unlock() return nil } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 4603f751d..2f2499811 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -397,14 +397,15 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - // setupRootContainerFS needs to find root from the context after the + // setupRootContainer needs to find root from the context after the // namespace is created. var mns *fs.MountNamespace setMountNS := func(m *fs.MountNamespace) { mns = m ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root()) } - if err := setupRootContainerFS(ctx, ctx, &tc.spec, conf, []int{sandEnd}, setMountNS); err != nil { + mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil, &podMountHints{}) + if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil { t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) } root := mns.Root() @@ -609,8 +610,8 @@ func TestRestoreEnvironment(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { conf := testConfig() - fds := &fdDispenser{fds: tc.ioFDs} - actualRenv, err := createRestoreEnvironment(tc.spec, conf, fds) + mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil, &podMountHints{}) + actualRenv, err := mntr.createRestoreEnvironment(conf) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) } else if tc.errorExpected { diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 0a154d90b..d86803252 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -56,7 +56,11 @@ type FDBasedLink struct { Addresses []net.IP Routes []Route GSOMaxSize uint32 - LinkAddress []byte + LinkAddress net.HardwareAddr + + // NumChannels controls how many underlying FD's are to be used to + // create this endpoint. + NumChannels int } // LoopbackLink configures a loopback li nk. @@ -68,8 +72,9 @@ type LoopbackLink struct { // CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. type CreateLinksAndRoutesArgs struct { - // FilePayload contains the fds associated with the FDBasedLinks. The - // two slices must have the same length. + // FilePayload contains the fds associated with the FDBasedLinks. The + // number of fd's should match the sum of the NumChannels field of the + // FDBasedLink entries below. urpc.FilePayload LoopbackLinks []LoopbackLink @@ -95,8 +100,12 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route { // CreateLinksAndRoutes creates links and routes in a network stack. It should // only be called once. 
func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { - if len(args.FilePayload.Files) != len(args.FDBasedLinks) { - return fmt.Errorf("FilePayload must be same length at FDBasedLinks") + wantFDs := 0 + for _, l := range args.FDBasedLinks { + wantFDs += l.NumChannels + } + if got := len(args.FilePayload.Files); got != wantFDs { + return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs) } var nicID tcpip.NICID @@ -123,20 +132,26 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } } - for i, link := range args.FDBasedLinks { + fdOffset := 0 + for _, link := range args.FDBasedLinks { nicID++ nicids[link.Name] = nicID - // Copy the underlying FD. - oldFD := args.FilePayload.Files[i].Fd() - newFD, err := syscall.Dup(int(oldFD)) - if err != nil { - return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + FDs := []int{} + for j := 0; j < link.NumChannels; j++ { + // Copy the underlying FD. + oldFD := args.FilePayload.Files[fdOffset].Fd() + newFD, err := syscall.Dup(int(oldFD)) + if err != nil { + return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + } + FDs = append(FDs, newFD) + fdOffset++ } mac := tcpip.LinkAddress(link.LinkAddress) linkEP, err := fdbased.New(&fdbased.Options{ - FD: newFD, + FDs: FDs, MTU: uint32(link.MTU), EthernetHeader: true, Address: mac, @@ -148,7 +163,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct return err } - log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil { return err } diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go new file mode 100644 index 000000000..463362f02 --- /dev/null +++ b/runsc/boot/pprof.go @@ -0,0 +1,18 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
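For context on the FD-based link change above: CreateLinksAndRoutes no longer assumes one file per link; it now requires len(FilePayload.Files) to equal the sum of NumChannels across all FDBasedLinks, consumed in slice order. A minimal caller-side sketch of assembling arguments under that invariant follows; buildLinkArgs and openFD are illustrative names, not part of this change.

package example // illustrative sketch only

import (
	"os"

	"gvisor.googlesource.com/gvisor/runsc/boot"
)

// buildLinkArgs collects one file per channel for every FD-based link, in the
// same order that (*Network).CreateLinksAndRoutes walks them: all channels of
// the first link, then the second, and so on. openFD stands in for whatever
// opens an AF_PACKET socket for the named interface.
func buildLinkArgs(links []boot.FDBasedLink, openFD func(linkName string) (*os.File, error)) (boot.CreateLinksAndRoutesArgs, error) {
	args := boot.CreateLinksAndRoutesArgs{FDBasedLinks: links}
	for _, l := range links {
		for i := 0; i < l.NumChannels; i++ {
			f, err := openFD(l.Name)
			if err != nil {
				return boot.CreateLinksAndRoutesArgs{}, err
			}
			args.FilePayload.Files = append(args.FilePayload.Files, f)
		}
	}
	// len(args.FilePayload.Files) now equals the sum of NumChannels, which is
	// exactly what the check above verifies.
	return args, nil
}
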
+ +package boot + +func initializePProf() { +} diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index b7551a5ab..df6af0ced 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -14,9 +14,11 @@ go_library( "debug.go", "delete.go", "do.go", + "error.go", "events.go", "exec.go", "gofer.go", + "help.go", "kill.go", "list.go", "path.go", @@ -28,6 +30,7 @@ go_library( "spec.go", "start.go", "state.go", + "syscalls.go", "wait.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/cmd", @@ -38,6 +41,7 @@ go_library( "//pkg/log", "//pkg/p9", "//pkg/sentry/control", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/unet", "//pkg/urpc", diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 3a547d4aa..e0a950e9c 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -130,6 +130,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Ensure that if there is a panic, all goroutine stacks are printed. debug.SetTraceback("all") + conf := args[0].(*boot.Config) + if b.setUpRoot { if err := setUpChroot(b.pidns); err != nil { Fatalf("error setting up chroot: %v", err) @@ -143,14 +145,16 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) args = append(args, arg) } } - // Note that we've already read the spec from the spec FD, and - // we will read it again after the exec call. This works - // because the ReadSpecFromFile function seeks to the beginning - // of the file before reading. - if err := callSelfAsNobody(args); err != nil { - Fatalf("%v", err) + if !conf.Rootless { + // Note that we've already read the spec from the spec FD, and + // we will read it again after the exec call. This works + // because the ReadSpecFromFile function seeks to the beginning + // of the file before reading. + if err := callSelfAsNobody(args); err != nil { + Fatalf("%v", err) + } + panic("callSelfAsNobody must never return success") } - panic("callSelfAsNobody must never return success") } } @@ -163,9 +167,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } specutils.LogSpec(spec) - conf := args[0].(*boot.Config) - waitStatus := args[1].(*syscall.WaitStatus) - if b.applyCaps { caps := spec.Process.Capabilities if caps == nil { @@ -251,6 +252,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) ws := l.WaitExit() log.Infof("application exiting with %+v", ws) + waitStatus := args[1].(*syscall.WaitStatus) *waitStatus = syscall.WaitStatus(ws.Status()) l.Destroy() return subcommands.ExitSuccess diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index ee74d33d8..2825dfaa5 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -116,6 +116,6 @@ func TestCapabilities(t *testing.T) { } func TestMain(m *testing.M) { - testutil.RunAsRoot() + specutils.MaybeRunAsRoot() os.Exit(m.Run()) } diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index a2fc377d1..5b4cc4a39 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -17,34 +17,15 @@ package cmd import ( "fmt" - "os" "runtime" "strconv" "syscall" - "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/specutils" ) -// Errorf logs to stderr and returns subcommands.ExitFailure. 
-func Errorf(s string, args ...interface{}) subcommands.ExitStatus { - // If runsc is being invoked by docker or cri-o, then we might not have - // access to stderr, so we log a serious-looking warning in addition to - // writing to stderr. - log.Warningf("FATAL ERROR: "+s, args...) - fmt.Fprintf(os.Stderr, s+"\n", args...) - // Return an error that is unlikely to be used by the application. - return subcommands.ExitFailure -} - -// Fatalf logs to stderr and exits with a failure status code. -func Fatalf(s string, args ...interface{}) { - Errorf(s, args...) - os.Exit(128) -} - // intFlags can be used with int flags that appear multiple times. type intFlags []int diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 629c198fd..e82e8c667 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -16,7 +16,6 @@ package cmd import ( "context" - "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" @@ -83,13 +82,17 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} id := f.Arg(0) conf := args[0].(*boot.Config) + if conf.Rootless { + return Errorf("Rootless mode not supported with %q", c.Name()) + } + bundleDir := c.bundleDir if bundleDir == "" { bundleDir = getwdOrDie() } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("reading spec: %v", err) + return Errorf("reading spec: %v", err) } specutils.LogSpec(spec) @@ -97,7 +100,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} // container unless the metadata specifies that it should be run in an // existing container. if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, c.userLog); err != nil { - Fatalf("creating container: %v", err) + return Errorf("creating container: %v", err) } return subcommands.ExitSuccess } diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 8ea59046c..3f6e46fce 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -39,10 +39,9 @@ import ( // Do implements subcommands.Command for the "do" command. It sets up a simple // sandbox and executes the command inside it. See Usage() for more details. type Do struct { - root string - cwd string - ip string - networkNamespace bool + root string + cwd string + ip string } // Name implements subcommands.Command.Name. @@ -72,7 +71,6 @@ func (c *Do) SetFlags(f *flag.FlagSet) { f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`) f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory") f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox") - f.BoolVar(&c.networkNamespace, "netns", true, "run in a new network namespace") } // Execute implements subcommands.Command.Execute. @@ -85,15 +83,21 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) - // Map the entire host file system, but make it readonly with a writable - // overlay on top (ignore --overlay option). - conf.Overlay = true + if conf.Rootless { + if err := specutils.MaybeRunAsRoot(); err != nil { + return Errorf("Error executing inside namespace: %v", err) + } + // Execution will continue here if no more capabilities are needed... + } hostname, err := os.Hostname() if err != nil { return Errorf("Error to retrieve hostname: %v", err) } + // Map the entire host file system, but make it readonly with a writable + // overlay on top (ignore --overlay option). 
+ conf.Overlay = true absRoot, err := resolvePath(c.root) if err != nil { return Errorf("Error resolving root: %v", err) @@ -119,11 +123,22 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su specutils.LogSpec(spec) cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) - if !c.networkNamespace { - if conf.Network != boot.NetworkHost { - Fatalf("The current network namespace can be used only if --network=host is set", nil) + if conf.Network == boot.NetworkNone { + netns := specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + } + if spec.Linux != nil { + panic("spec.Linux is not nil") } - } else if conf.Network != boot.NetworkNone { + spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}} + + } else if conf.Rootless { + if conf.Network == boot.NetworkSandbox { + fmt.Println("*** Rootless requires changing network type to host ***") + conf.Network = boot.NetworkHost + } + + } else { clean, err := c.setupNet(cid, spec) if err != nil { return Errorf("Error setting up network: %v", err) diff --git a/runsc/cmd/error.go b/runsc/cmd/error.go new file mode 100644 index 000000000..700b19f14 --- /dev/null +++ b/runsc/cmd/error.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "fmt" + "io" + "os" + "time" + + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// ErrorLogger is where error messages should be written to. These messages are +// consumed by containerd and show up to users of command line tools, +// like docker/kubectl. +var ErrorLogger io.Writer + +type jsonError struct { + Msg string `json:"msg"` + Level string `json:"level"` + Time time.Time `json:"time"` +} + +// Errorf logs error to containerd log (--log), to stderr, and debug logs. It +// returns subcommands.ExitFailure for convenience with subcommand.Execute() +// methods: +// return Errorf("Danger! Danger!") +// +func Errorf(format string, args ...interface{}) subcommands.ExitStatus { + // If runsc is being invoked by docker or cri-o, then we might not have + // access to stderr, so we log a serious-looking warning in addition to + // writing to stderr. + log.Warningf("FATAL ERROR: "+format, args...) + fmt.Fprintf(os.Stderr, format+"\n", args...) + + j := jsonError{ + Msg: fmt.Sprintf(format, args...), + Level: "error", + Time: time.Now(), + } + b, err := json.Marshal(j) + if err != nil { + panic(err) + } + if ErrorLogger != nil { + ErrorLogger.Write(b) + } + + return subcommands.ExitFailure +} + +// Fatalf logs the same way as Errorf() does, plus *exits* the process. +func Fatalf(format string, args ...interface{}) { + Errorf(format, args...) + // Return an error that is unlikely to be used by the application. 
+ os.Exit(128) +} diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 52fd7ac4b..0eeaaadba 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -40,8 +40,6 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) -const privateClearStatusFlag = "private-clear-status" - // Exec implements subcommands.Command for the "exec" command. type Exec struct { cwd string @@ -51,7 +49,6 @@ type Exec struct { extraKGIDs stringSlice caps stringSlice detach bool - clearStatus bool processPath string pidFile string internalPidFile string @@ -103,10 +100,6 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to") f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") - - // This flag clears the status of the exec'd process upon completion. It is - // only used when we fork due to --detach being set on the parent. - f.BoolVar(&ex.clearStatus, privateClearStatusFlag, true, "private flag, do not use") } // Execute implements subcommands.Command.Execute. It starts a process in an @@ -150,13 +143,16 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // write the child's PID to the pid file. So when the container returns, the // child process will also return and signal containerd. if ex.detach { - return ex.execAndWait(waitStatus) + return ex.execChildAndWait(waitStatus) } + return ex.exec(c, e, waitStatus) +} +func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus { // Start the new process and get it pid. pid, err := c.Execute(e) if err != nil { - Fatalf("getting processes for container: %v", err) + return Errorf("executing processes for container: %v", err) } if e.StdioIsPty { @@ -170,33 +166,37 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if ex.internalPidFile != "" { pidStr := []byte(strconv.Itoa(int(pid))) if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil { - Fatalf("writing internal pid file %q: %v", ex.internalPidFile, err) + return Errorf("writing internal pid file %q: %v", ex.internalPidFile, err) } } - // Generate the pid file after the internal pid file is generated, so that users - // can safely assume that the internal pid file is ready after `runsc exec -d` - // returns. + // Generate the pid file after the internal pid file is generated, so that + // users can safely assume that the internal pid file is ready after + // `runsc exec -d` returns. if ex.pidFile != "" { if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil { - Fatalf("writing pid file: %v", err) + return Errorf("writing pid file: %v", err) } } // Wait for the process to exit. 
- ws, err := c.WaitPID(pid, ex.clearStatus) + ws, err := c.WaitPID(pid) if err != nil { - Fatalf("waiting on pid %d: %v", pid, err) + return Errorf("waiting on pid %d: %v", pid, err) } *waitStatus = ws return subcommands.ExitSuccess } -func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus { - binPath := specutils.ExePath +func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus { var args []string + for _, a := range os.Args[1:] { + if !strings.Contains(a, "detach") { + args = append(args, a) + } + } - // The command needs to write a pid file so that execAndWait can tell + // The command needs to write a pid file so that execChildAndWait can tell // when it has started. If no pid-file was provided, we should use a // filename in a temp directory. pidFile := ex.pidFile @@ -210,19 +210,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat args = append(args, "--pid-file="+pidFile) } - // Add the rest of the args, excluding the "detach" flag. - for _, a := range os.Args[1:] { - if strings.Contains(a, "detach") { - // Replace with the "private-clear-status" flag, which tells - // the new process it's a detached child and shouldn't - // clear the exit status of the sentry process. - args = append(args, fmt.Sprintf("--%s=false", privateClearStatusFlag)) - } else { - args = append(args, a) - } - } - - cmd := exec.Command(binPath, args...) + cmd := exec.Command(specutils.ExePath, args...) cmd.Args[0] = "runsc-exec" // Exec stdio defaults to current process stdio. @@ -233,8 +221,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat // If the console control socket file is provided, then create a new // pty master/slave pair and set the TTY on the sandbox process. if ex.consoleSocket != "" { - // Create a new TTY pair and send the master on the provided - // socket. + // Create a new TTY pair and send the master on the provided socket. tty, err := console.NewWithSocket(ex.consoleSocket) if err != nil { Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err) @@ -256,7 +243,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat Fatalf("failure to start child exec process, err: %v", err) } - log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args) + log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args) // Wait for PID file to ensure that child process has started. Otherwise, // '--process' file is deleted as soon as this process returns and the child @@ -278,7 +265,10 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat return false, nil } if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil { - Fatalf("unexpected error waiting for PID file, err: %v", err) + // Don't log fatal error here, otherwise it will override the error logged + // by the child process that has failed to start. + log.Warningf("Unexpected error waiting for PID file, err: %v", err) + return subcommands.ExitFailure } *waitStatus = 0 diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go new file mode 100644 index 000000000..ff4f901cb --- /dev/null +++ b/runsc/cmd/help.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "fmt" + + "flag" + "github.com/google/subcommands" +) + +// NewHelp returns a help command for the given commander. +func NewHelp(cdr *subcommands.Commander) *Help { + return &Help{ + cdr: cdr, + } +} + +// Help implements subcommands.Command for the "help" command. The 'help' +// command prints help for commands registered to a Commander but also allows for +// registering additional help commands that print other documentation. +type Help struct { + cdr *subcommands.Commander + commands []subcommands.Command + help bool +} + +// Name implements subcommands.Command.Name. +func (*Help) Name() string { + return "help" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Help) Synopsis() string { + return "Print help documentation." +} + +// Usage implements subcommands.Command.Usage. +func (*Help) Usage() string { + return `help [<subcommand>]: + With an argument, prints detailed information on the use of + the specified topic or subcommand. With no argument, print a list of + all commands and a brief description of each. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (h *Help) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command.Execute. +func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + switch f.NArg() { + case 0: + fmt.Fprintf(h.cdr.Output, "Usage: %s <flags> <subcommand> <subcommand args>\n\n", h.cdr.Name()) + fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open +Container Initiative (OCI) format. Applications run by runsc are run in an +isolated gVisor sandbox that emulates a Linux environment. + +gVisor is a user-space kernel, written in Go, that implements a substantial +portion of the Linux system call interface. It provides an additional layer +of isolation between running applications and the host operating system. + +Functionality is provided by subcommands. For additonal help on individual +subcommands use "%s %s <subcommand>". + +`, h.cdr.Name(), h.Name()) + h.cdr.VisitGroups(func(g *subcommands.CommandGroup) { + h.cdr.ExplainGroup(h.cdr.Output, g) + }) + + fmt.Fprintf(h.cdr.Output, "Additional help topics (Use \"%s %s <topic>\" to see help on the topic):\n", h.cdr.Name(), h.Name()) + for _, cmd := range h.commands { + fmt.Fprintf(h.cdr.Output, "\t%-15s %s\n", cmd.Name(), cmd.Synopsis()) + } + fmt.Fprintf(h.cdr.Output, "\nUse \"%s flags\" for a list of top-level flags\n", h.cdr.Name()) + return subcommands.ExitSuccess + default: + // Look for commands registered to the commander and print help explanation if found. + found := false + h.cdr.VisitCommands(func(g *subcommands.CommandGroup, cmd subcommands.Command) { + if f.Arg(0) == cmd.Name() { + h.cdr.ExplainCommand(h.cdr.Output, cmd) + found = true + } + }) + if found { + return subcommands.ExitSuccess + } + + // Next check commands registered to the help command. 
+ for _, cmd := range h.commands { + if f.Arg(0) == cmd.Name() { + fs := flag.NewFlagSet(f.Arg(0), flag.ContinueOnError) + fs.Usage = func() { h.cdr.ExplainCommand(h.cdr.Error, cmd) } + cmd.SetFlags(fs) + if fs.Parse(f.Args()[1:]) != nil { + return subcommands.ExitUsageError + } + return cmd.Execute(ctx, f, args...) + } + } + + fmt.Fprintf(h.cdr.Error, "Subcommand %s not understood\n", f.Arg(0)) + } + + f.Usage() + return subcommands.ExitUsageError +} + +// Register registers a new help command. +func (h *Help) Register(cmd subcommands.Command) { + h.commands = append(h.commands, cmd) +} diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 3ab2f5676..a78a0dce6 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -80,25 +80,29 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) + if conf.Rootless { + return Errorf("Rootless mode not supported with %q", r.Name()) + } + bundleDir := r.bundleDir if bundleDir == "" { bundleDir = getwdOrDie() } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("reading spec: %v", err) + return Errorf("reading spec: %v", err) } specutils.LogSpec(spec) if r.imagePath == "" { - Fatalf("image-path flag must be provided") + return Errorf("image-path flag must be provided") } conf.RestoreFile = filepath.Join(r.imagePath, checkpointFileName) ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach) if err != nil { - Fatalf("running container: %v", err) + return Errorf("running container: %v", err) } *waitStatus = ws diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index c228b4f93..abf602239 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -67,19 +67,23 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) + if conf.Rootless { + return Errorf("Rootless mode not supported with %q", r.Name()) + } + bundleDir := r.bundleDir if bundleDir == "" { bundleDir = getwdOrDie() } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("reading spec: %v", err) + return Errorf("reading spec: %v", err) } specutils.LogSpec(spec) ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach) if err != nil { - Fatalf("running container: %v", err) + return Errorf("running container: %v", err) } *waitStatus = ws diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 657726251..31e8f42bb 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -16,7 +16,6 @@ package cmd import ( "context" - "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go new file mode 100644 index 000000000..9c8a66490 --- /dev/null +++ b/runsc/cmd/syscalls.go @@ -0,0 +1,347 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
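The syscalls command defined in the new file below is documentation-only: rather than being registered as a regular subcommand, it is attached to the Help command added above, so it is listed under "Additional help topics" and runs when invoked as "runsc help syscalls". A minimal sketch of that wiring, mirroring what main.go does further down (the enclosing function name is illustrative):

package example // illustrative sketch only

import (
	"github.com/google/subcommands"

	"gvisor.googlesource.com/gvisor/runsc/cmd"
)

func registerHelp() {
	// Topics added with Register are listed by "runsc help" and executed when
	// requested by name, e.g. "runsc help syscalls".
	help := cmd.NewHelp(subcommands.DefaultCommander)
	help.Register(new(cmd.Syscalls))
	subcommands.Register(help, "")
}
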
+ +package cmd + +import ( + "context" + "encoding/csv" + "encoding/json" + "fmt" + "io" + "os" + "sort" + "strconv" + "text/tabwriter" + + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// Syscalls implements subcommands.Command for the "syscalls" command. +type Syscalls struct { + output string + os string + arch string +} + +// CompatibilityInfo is a map of system and architecture to compatibility doc. +// Maps operating system to architecture to ArchInfo. +type CompatibilityInfo map[string]map[string]ArchInfo + +// ArchInfo is compatbility doc for an architecture. +type ArchInfo struct { + // Syscalls maps syscall number for the architecture to the doc. + Syscalls map[uintptr]SyscallDoc `json:"syscalls"` +} + +// SyscallDoc represents a single item of syscall documentation. +type SyscallDoc struct { + Name string `json:"name"` + num uintptr + + Support string `json:"support"` + Note string `json:"note,omitempty"` + URLs []string `json:"urls,omitempty"` +} + +type outputFunc func(io.Writer, CompatibilityInfo) error + +var ( + // The string name to use for printing compatibility for all OSes. + osAll = "all" + + // The string name to use for printing compatibility for all architectures. + archAll = "all" + + // A map of OS name to map of architecture name to syscall table. + syscallTableMap = make(map[string]map[string]*kernel.SyscallTable) + + // A map of output type names to output functions. + outputMap = map[string]outputFunc{ + "table": outputTable, + "json": outputJSON, + "csv": outputCSV, + } +) + +// Name implements subcommands.Command.Name. +func (*Syscalls) Name() string { + return "syscalls" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Syscalls) Synopsis() string { + return "Print compatibility information for syscalls." +} + +// Usage implements subcommands.Command.Usage. +func (*Syscalls) Usage() string { + return `syscalls [options] - Print compatibility information for syscalls. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (s *Syscalls) SetFlags(f *flag.FlagSet) { + f.StringVar(&s.output, "o", "table", "Output format (table, csv, json).") + f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)") + f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).") +} + +// Execute implements subcommands.Command.Execute. +func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + out, ok := outputMap[s.output] + if !ok { + Fatalf("Unsupported output format %q", s.output) + } + + // Build map of all supported architectures. + tables := kernel.SyscallTables() + for _, t := range tables { + osMap, ok := syscallTableMap[t.OS.String()] + if !ok { + osMap = make(map[string]*kernel.SyscallTable) + syscallTableMap[t.OS.String()] = osMap + } + osMap[t.Arch.String()] = t + } + + // Build a map of the architectures we want to output. + info, err := getCompatibilityInfo(s.os, s.arch) + if err != nil { + Fatalf("%v", err) + } + + if err := out(os.Stdout, info); err != nil { + Fatalf("Error writing output: %v", err) + } + + return subcommands.ExitSuccess +} + +// getCompatibilityInfo returns compatibility info for the given OS name and +// architecture name. Supports the special name 'all' for OS and architecture that +// specifies that all supported OSes or architectures should be included. 
+func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, error) { + info := CompatibilityInfo(make(map[string]map[string]ArchInfo)) + if osName == osAll { + // Special processing for the 'all' OS name. + for osName, _ := range syscallTableMap { + info[osName] = make(map[string]ArchInfo) + // osName is a specific OS name. + if err := addToCompatibilityInfo(info, osName, archName); err != nil { + return info, err + } + } + } else { + // osName is a specific OS name. + info[osName] = make(map[string]ArchInfo) + if err := addToCompatibilityInfo(info, osName, archName); err != nil { + return info, err + } + } + + return info, nil +} + +// addToCompatibilityInfo adds ArchInfo for the given specific OS name and +// architecture name. Supports the special architecture name 'all' to specify +// that all supported architectures for the OS should be included. +func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error { + if archName == archAll { + // Special processing for the 'all' architecture name. + for archName, _ := range syscallTableMap[osName] { + archInfo, err := getArchInfo(osName, archName) + if err != nil { + return err + } + info[osName][archName] = archInfo + } + } else { + // archName is a specific architecture name. + archInfo, err := getArchInfo(osName, archName) + if err != nil { + return err + } + info[osName][archName] = archInfo + } + + return nil +} + +// getArchInfo returns compatibility info for a specific OS and architecture. +func getArchInfo(osName string, archName string) (ArchInfo, error) { + info := ArchInfo{} + info.Syscalls = make(map[uintptr]SyscallDoc) + + t, ok := syscallTableMap[osName][archName] + if !ok { + return info, fmt.Errorf("syscall table for %s/%s not found", osName, archName) + } + + for num, sc := range t.Table { + info.Syscalls[num] = SyscallDoc{ + Name: sc.Name, + num: num, + Support: sc.SupportLevel.String(), + Note: sc.Note, + URLs: sc.URLs, + } + } + + return info, nil +} + +// outputTable outputs the syscall info in tabular format. +func outputTable(w io.Writer, info CompatibilityInfo) error { + tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0) + + // Linux + for osName, osInfo := range info { + for archName, archInfo := range osInfo { + // Print the OS/arch + fmt.Fprintf(w, "%s/%s:\n\n", osName, archName) + + // Sort the syscalls for output in the table. + sortedCalls := []SyscallDoc{} + for _, sc := range archInfo.Syscalls { + sortedCalls = append(sortedCalls, sc) + } + sort.Slice(sortedCalls, func(i, j int) bool { + return sortedCalls[i].num < sortedCalls[j].num + }) + + // Write the header + _, err := fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", + "NUM", + "NAME", + "SUPPORT", + "NOTE", + ) + if err != nil { + return err + } + + // Write each syscall entry + for _, sc := range sortedCalls { + _, err = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", + strconv.FormatInt(int64(sc.num), 10), + sc.Name, + sc.Support, + sc.Note, + ) + if err != nil { + return err + } + // Add issue urls to note. + for _, url := range sc.URLs { + _, err = fmt.Fprintf(tw, "%s\t%s\t%s\tSee: %s\t\n", + "", + "", + "", + url, + ) + if err != nil { + return err + } + } + } + + err = tw.Flush() + if err != nil { + return err + } + } + } + + return nil +} + +// outputJSON outputs the syscall info in JSON format. 
+func outputJSON(w io.Writer, info CompatibilityInfo) error { + e := json.NewEncoder(w) + e.SetIndent("", " ") + return e.Encode(info) +} + +// numberedRow is aCSV row annotated by syscall number (used for sorting) +type numberedRow struct { + num uintptr + row []string +} + +// outputCSV outputs the syscall info in tabular format. +func outputCSV(w io.Writer, info CompatibilityInfo) error { + csvWriter := csv.NewWriter(w) + + // Linux + for osName, osInfo := range info { + for archName, archInfo := range osInfo { + // Sort the syscalls for output in the table. + sortedCalls := []numberedRow{} + for _, sc := range archInfo.Syscalls { + // Add issue urls to note. + note := sc.Note + for _, url := range sc.URLs { + note = fmt.Sprintf("%s\nSee: %s", note, url) + } + + sortedCalls = append(sortedCalls, numberedRow{ + num: sc.num, + row: []string{ + osName, + archName, + strconv.FormatInt(int64(sc.num), 10), + sc.Name, + sc.Support, + note, + }, + }) + } + sort.Slice(sortedCalls, func(i, j int) bool { + return sortedCalls[i].num < sortedCalls[j].num + }) + + // Write the header + err := csvWriter.Write([]string{ + "OS", + "Arch", + "Num", + "Name", + "Support", + "Note", + }) + if err != nil { + return err + } + + // Write each syscall entry + for _, sc := range sortedCalls { + err = csvWriter.Write(sc.row) + if err != nil { + return err + } + } + + csvWriter.Flush() + err = csvWriter.Error() + if err != nil { + return err + } + } + } + + return nil +} diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index a55a682f3..58fd01974 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -88,14 +88,14 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) waitStatus = ws // Wait on a PID in the root PID namespace. case wt.rootPID != unsetPID: - ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */) + ws, err := c.WaitRootPID(int32(wt.rootPID)) if err != nil { Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) } waitStatus = ws // Wait on a PID in the container's PID namespace. case wt.pid != unsetPID: - ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */) + ws, err := c.WaitPID(int32(wt.pid)) if err != nil { Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err) } diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index b8af27c15..d016533e6 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -258,7 +258,7 @@ func TestJobControlSignalExec(t *testing.T) { } // Make sure the process indicates it was killed by a SIGKILL. - ws, err := c.WaitPID(pid, true) + ws, err := c.WaitPID(pid) if err != nil { t.Errorf("waiting on container failed: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 513085836..04b611b56 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -530,22 +530,22 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and // returns its WaitStatus. 
-func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { +func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } - return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus) + return c.Sandbox.WaitPID(c.Sandbox.ID, pid) } // WaitPID waits for process 'pid' in the container's PID namespace and returns // its WaitStatus. -func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { +func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on PID %d in container %q", pid, c.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } - return c.Sandbox.WaitPID(c.ID, pid, clearStatus) + return c.Sandbox.WaitPID(c.ID, pid) } // SignalContainer sends the signal to the container. If all is true and signal diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index dcd9910a0..867bf8187 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -36,6 +36,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -1841,7 +1842,7 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, if err != nil { return 0, fmt.Errorf("error executing: %v", err) } - ws, err := cont.WaitPID(pid, true /* clearStatus */) + ws, err := cont.WaitPID(pid) if err != nil { return 0, fmt.Errorf("error waiting: %v", err) } @@ -1853,7 +1854,7 @@ func TestMain(m *testing.M) { if err := testutil.ConfigureExePath(); err != nil { panic(err.Error()) } - testutil.RunAsRoot() + specutils.MaybeRunAsRoot() os.Exit(m.Run()) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 39c4dc03d..d57a73d46 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -99,6 +99,36 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C return containers, cleanup, nil } +type execDesc struct { + c *Container + cmd []string + want int + desc string +} + +func execMany(execs []execDesc) error { + for _, exec := range execs { + args := &control.ExecArgs{Argv: exec.cmd} + if ws, err := exec.c.executeSync(args); err != nil { + return fmt.Errorf("error executing %+v: %v", args, err) + } else if ws.ExitStatus() != exec.want { + return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want) + } + } + return nil +} + +func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) { + for _, spec := range pod { + spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source + spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type + spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod" + if len(mount.Options) > 0 { + spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",") + } + } +} + // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. 
func TestMultiContainerSanity(t *testing.T) { @@ -175,12 +205,12 @@ func TestMultiContainerWait(t *testing.T) { go func(c *Container) { defer wg.Done() const pid = 2 - if ws, err := c.WaitPID(pid, true /* clearStatus */); err != nil { + if ws, err := c.WaitPID(pid); err != nil { t.Errorf("failed to wait for PID %d: %v", pid, err) } else if es := ws.ExitStatus(); es != 0 { t.Errorf("PID %d exited with non-zero status %d", pid, es) } - if _, err := c.WaitPID(pid, true /* clearStatus */); err == nil { + if _, err := c.WaitPID(pid); err == nil { t.Errorf("wait for stopped PID %d should fail", pid) } }(containers[1]) @@ -263,12 +293,12 @@ func TestExecWait(t *testing.T) { } // Get the exit status from the exec'd process. - if ws, err := containers[0].WaitPID(pid, true /* clearStatus */); err != nil { + if ws, err := containers[0].WaitPID(pid); err != nil { t.Fatalf("failed to wait for process %+v with pid %d: %v", args, pid, err) } else if es := ws.ExitStatus(); es != 0 { t.Fatalf("process %+v exited with non-zero status %d", args, es) } - if _, err := containers[0].WaitPID(pid, true /* clearStatus */); err == nil { + if _, err := containers[0].WaitPID(pid); err == nil { t.Fatalf("wait for stopped process %+v should fail", args) } } @@ -828,3 +858,272 @@ func TestMultiContainerGoferStop(t *testing.T) { } } } + +// Test that pod shared mounts are properly mounted in 2 containers and that +// changes from one container is reflected in the other. +func TestMultiContainerSharedMount(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + + // Setup the containers. + sleep := []string{"sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: nil, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + + createSharedMount(mnt0, "test-mount", podSpec...) 
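The createSharedMount helper above captures the new pod-shared mount convention: every container in the pod declares the mount (possibly at a different Destination), and annotations keyed under boot.MountPrefix plus a shared name tell the sandbox to back all of those declarations with a single mount, so changes made in one container are visible in the others. A hedged sketch of declaring such a mount outside the test harness (shareAcrossPod is an illustrative name; destinations must have one entry per spec):

package example // illustrative sketch only

import (
	"path"
	"strings"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.googlesource.com/gvisor/runsc/boot"
)

// shareAcrossPod adds mnt to every spec in the pod at the matching
// destination and writes the same annotations that createSharedMount
// produces above.
func shareAcrossPod(name string, mnt specs.Mount, destinations []string, pod ...*specs.Spec) {
	for i, spec := range pod {
		m := mnt
		m.Destination = destinations[i]
		spec.Mounts = append(spec.Mounts, m)

		if spec.Annotations == nil {
			spec.Annotations = make(map[string]string)
		}
		spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mnt.Source
		spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mnt.Type
		spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
		if len(mnt.Options) > 0 {
			spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mnt.Options, ",")
		}
	}
}

Passing Options: []string{"ro"} makes the shared mount read-only in every container, which is what the read-only test below exercises.
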
+ + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + file0 := path.Join(mnt0.Destination, "abc") + file1 := path.Join(mnt1.Destination, "abc") + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + desc: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + desc: "directory is mounted in container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/touch", file0}, + desc: "create file in container0", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-f", file0}, + desc: "file appears in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-f", file1}, + desc: "file appears in container1", + }, + { + c: containers[1], + cmd: []string{"/bin/rm", file1}, + desc: "file removed from container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "!", "-f", file0}, + desc: "file removed from container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "!", "-f", file1}, + desc: "file removed from container1", + }, + { + c: containers[1], + cmd: []string{"/bin/mkdir", file1}, + desc: "create directory in container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", file0}, + desc: "dir appears in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", file1}, + desc: "dir appears in container1", + }, + { + c: containers[0], + cmd: []string{"/bin/rmdir", file0}, + desc: "create directory in container0", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "!", "-d", file0}, + desc: "dir removed from container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "!", "-d", file1}, + desc: "dir removed from container1", + }, + } + if err := execMany(execs); err != nil { + t.Fatal(err.Error()) + } + } +} + +// Test that pod mounts are mounted as readonly when requested. +func TestMultiContainerSharedMountReadonly(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + + // Setup the containers. + sleep := []string{"sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: []string{"ro"}, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + + createSharedMount(mnt0, "test-mount", podSpec...) 
+ + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + file0 := path.Join(mnt0.Destination, "abc") + file1 := path.Join(mnt1.Destination, "abc") + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + desc: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + desc: "directory is mounted in container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/touch", file0}, + want: 1, + desc: "fails to write to container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/touch", file1}, + want: 1, + desc: "fails to write to container1", + }, + } + if err := execMany(execs); err != nil { + t.Fatal(err.Error()) + } + } +} + +// Test that shared pod mounts continue to work after container is restarted. +func TestMultiContainerSharedMountRestart(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + + // Setup the containers. + sleep := []string{"sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: nil, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + + createSharedMount(mnt0, "test-mount", podSpec...) + + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + file0 := path.Join(mnt0.Destination, "abc") + file1 := path.Join(mnt1.Destination, "abc") + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/touch", file0}, + desc: "create file in container0", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-f", file0}, + desc: "file appears in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-f", file1}, + desc: "file appears in container1", + }, + } + if err := execMany(execs); err != nil { + t.Fatal(err.Error()) + } + + containers[1].Destroy() + + bundleDir, err := testutil.SetupBundleDir(podSpec[1]) + if err != nil { + t.Fatalf("error restarting container: %v", err) + } + defer os.RemoveAll(bundleDir) + + containers[1], err = Create(ids[1], podSpec[1], conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + if err := containers[1].Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + execs = []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-f", file0}, + desc: "file is still in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-f", file1}, + desc: "file is still in container1", + }, + { + c: containers[1], + cmd: []string{"/bin/rm", file1}, + desc: "file removed from container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "!", "-f", file0}, + desc: "file removed from container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "!", "-f", file1}, + desc: "file removed from container1", + }, + } + if err := execMany(execs); err != nil { + t.Fatal(err.Error()) + } + } +} diff --git a/runsc/main.go b/runsc/main.go index 11bc73f75..cfe3a78d0 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -48,11 +48,12 @@ var ( // system that are not covered by the runtime spec. // Debugging flags. 
- debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") - logPackets = flag.Bool("log-packets", false, "enable network packet logging") - logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") - debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") - debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s") + debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") + logPackets = flag.Bool("log-packets", false, "enable network packet logging") + logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") + debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") + debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s") + alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr") // Debugging flags: strace related strace = flag.Bool("strace", false, "enable strace") @@ -60,22 +61,27 @@ var ( straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") // Flags that control sandbox runtime behavior. - platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") - network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - gso = flag.Bool("gso", true, "enable generic segmenation offload") - fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") - overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") - watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") - panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") - profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") - netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") - + platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. 
Using network inside the sandbox is more secure because it's isolated from the host network.") + gso = flag.Bool("gso", true, "enable generic segmenation offload") + fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") + numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") + rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") + + // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") ) func main() { // Help and flags commands are generated automatically. - subcommands.Register(subcommands.HelpCommand(), "") + help := cmd.NewHelp(subcommands.DefaultCommander) + help.Register(new(cmd.Syscalls)) + subcommands.Register(help, "") subcommands.Register(subcommands.FlagsCommand(), "") // Register user-facing runsc commands. @@ -117,6 +123,22 @@ func main() { os.Exit(0) } + var errorLogger io.Writer + if *logFD > -1 { + errorLogger = os.NewFile(uintptr(*logFD), "error log file") + + } else if *logFilename != "" { + // We must set O_APPEND and not O_TRUNC because Docker passes + // the same log file for all commands (and also parses these + // log files), so we can't destroy them on each command. + var err error + errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) + if err != nil { + cmd.Fatalf("error opening log file %q: %v", *logFilename, err) + } + } + cmd.ErrorLogger = errorLogger + platformType, err := boot.MakePlatformType(*platform) if err != nil { cmd.Fatalf("%v", err) @@ -141,26 +163,33 @@ func main() { cmd.Fatalf("%v", err) } + if *numNetworkChannels <= 0 { + cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels) + } + // Create a new Config from the flags. 
conf := &boot.Config{ - RootDir: *rootDir, - Debug: *debug, - LogFilename: *logFilename, - LogFormat: *logFormat, - DebugLog: *debugLog, - DebugLogFormat: *debugLogFormat, - FileAccess: fsAccess, - Overlay: *overlay, - Network: netType, - GSO: *gso, - LogPackets: *logPackets, - Platform: platformType, - Strace: *strace, - StraceLogSize: *straceLogSize, - WatchdogAction: wa, - PanicSignal: *panicSignal, - ProfileEnable: *profile, - EnableRaw: *netRaw, + RootDir: *rootDir, + Debug: *debug, + LogFilename: *logFilename, + LogFormat: *logFormat, + DebugLog: *debugLog, + DebugLogFormat: *debugLogFormat, + FileAccess: fsAccess, + Overlay: *overlay, + Network: netType, + GSO: *gso, + LogPackets: *logPackets, + Platform: platformType, + Strace: *strace, + StraceLogSize: *straceLogSize, + WatchdogAction: wa, + PanicSignal: *panicSignal, + ProfileEnable: *profile, + EnableRaw: *netRaw, + NumNetworkChannels: *numNetworkChannels, + Rootless: *rootless, + TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, } if len(*straceSyscalls) != 0 { @@ -174,24 +203,7 @@ func main() { subcommand := flag.CommandLine.Arg(0) - var logFile io.Writer = os.Stderr - if *logFD > -1 { - logFile = os.NewFile(uintptr(*logFD), "log file") - } else if *logFilename != "" { - // We must set O_APPEND and not O_TRUNC because Docker passes - // the same log file for all commands (and also parses these - // log files), so we can't destroy them on each command. - f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) - if err != nil { - cmd.Fatalf("error opening log file %q: %v", *logFilename, err) - } - logFile = f - } else if subcommand == "do" { - logFile = ioutil.Discard - } - - e := newEmitter(*logFormat, logFile) - + var e log.Emitter if *debugLogFD > -1 { f := os.NewFile(uintptr(*debugLogFD), "debug log file") @@ -201,28 +213,31 @@ func main() { cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand) } - // If we are the boot process, then we own our stdio FDs and - // can do what we want with them. Since Docker and Containerd - // both eat boot's stderr, we dup our stderr to the provided - // log FD so that panics will appear in the logs, rather than - // just disappear. + // If we are the boot process, then we own our stdio FDs and can do what we + // want with them. Since Docker and Containerd both eat boot's stderr, we + // dup our stderr to the provided log FD so that panics will appear in the + // logs, rather than just disappear. if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) } - if logFile == os.Stderr { - // Suppress logging to stderr when debug log is enabled. Otherwise all - // messages will be duplicated in the debug log (see Dup2() call above). - e = newEmitter(*debugLogFormat, f) - } else { - e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)} - } + e = newEmitter(*debugLogFormat, f) + } else if *debugLog != "" { f, err := specutils.DebugLogFile(*debugLog, subcommand) if err != nil { cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err) } - e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)} + e = newEmitter(*debugLogFormat, f) + + } else { + // Stderr is reserved for the application, just discard the logs if no debug + // log is specified. 
+ e = newEmitter("text", ioutil.Discard) + } + + if *alsoLogToStderr { + e = log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)} } log.SetTarget(e) diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 0460d5f1a..e9e24fc58 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -68,7 +68,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: @@ -138,7 +138,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { @@ -202,25 +202,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO continue } - // Create the socket. - const protocol = 0x0300 // htons(ETH_P_ALL) - fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) - if err != nil { - return fmt.Errorf("unable to create raw socket: %v", err) - } - deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") - - // Bind to the appropriate device. - ll := syscall.SockaddrLinklayer{ - Protocol: protocol, - Ifindex: iface.Index, - Hatype: 0, // No ARP type. - Pkttype: syscall.PACKET_OTHERHOST, - } - if err := syscall.Bind(fd, &ll); err != nil { - return fmt.Errorf("unable to bind to %q: %v", iface.Name, err) - } - // Scrape the routes before removing the address, since that // will remove the routes as well. routes, def, err := routesForIface(iface) @@ -236,9 +217,10 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } link := boot.FDBasedLink{ - Name: iface.Name, - MTU: iface.MTU, - Routes: routes, + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + NumChannels: numNetworkChannels, } // Get the link for the interface. @@ -246,32 +228,25 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO if err != nil { return fmt.Errorf("getting link for interface %q: %v", iface.Name, err) } - link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr) + link.LinkAddress = ifaceLink.Attrs().HardwareAddr - if enableGSO { - gso, err := isGSOEnabled(fd, iface.Name) + log.Debugf("Setting up network channels") + // Create the socket for the device. 
+ for i := 0; i < link.NumChannels; i++ { + log.Debugf("Creating Channel %d", i) + socketEntry, err := createSocket(iface, ifaceLink, enableGSO) if err != nil { - return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) + return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err) } - if gso { - if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { - return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) - } - link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize + if i == 0 { + link.GSOMaxSize = socketEntry.gsoMaxSize } else { - log.Infof("GSO not available in host.") + if link.GSOMaxSize != socketEntry.gsoMaxSize { + return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", + link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name) + } } - } - - // Use SO_RCVBUFFORCE because on linux the receive buffer for an - // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max - // defaults to a unusually low value of 208KB. This is too low - // for gVisor to be able to receive packets at high throughputs - // without incurring packet drops. - const rcvBufSize = 4 << 20 // 4MB. - - if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil { - return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err) + args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) } // Collect the addresses for the interface, enable forwarding, @@ -285,7 +260,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } } - args.FilePayload.Files = append(args.FilePayload.Files, deviceFile) args.FDBasedLinks = append(args.FDBasedLinks, link) } @@ -296,6 +270,61 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO return nil } +type socketEntry struct { + deviceFile *os.File + gsoMaxSize uint32 +} + +// createSocket creates an underlying AF_PACKET socket and configures it for use by +// the sentry and returns an *os.File that wraps the underlying socket fd. +func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) { + // Create the socket. + const protocol = 0x0300 // htons(ETH_P_ALL) + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return nil, fmt.Errorf("unable to create raw socket: %v", err) + } + deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: iface.Index, + Hatype: 0, // No ARP type. + Pkttype: syscall.PACKET_OTHERHOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + gsoMaxSize := uint32(0) + if enableGSO { + gso, err := isGSOEnabled(fd, iface.Name) + if err != nil { + return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) + } + if gso { + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { + return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) + } + gsoMaxSize = ifaceLink.Attrs().GSOMaxSize + } else { + log.Infof("GSO not available in host.") + } + } + + // Use SO_RCVBUFFORCE because on linux the receive buffer for an + // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max + // defaults to a unusually low value of 208KB. 
This is too low + // for gVisor to be able to receive packets at high throughputs + // without incurring packet drops. + const rcvBufSize = 4 << 20 // 4MB. + + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil { + return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err) + } + return &socketEntry{deviceFile, gsoMaxSize}, nil +} + // loopbackLinks collects the links for a loopback interface. func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) { var links []boot.LoopbackLink diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 47a66afb2..5ff6f879c 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -515,46 +515,64 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { log.Infof("Sandbox will be started in new user namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) + cmd.Args = append(cmd.Args, "--setup-root") - // Map nobody in the new namespace to nobody in the parent namespace. - // - // A sandbox process will construct an empty - // root for itself, so it has to have the CAP_SYS_ADMIN - // capability. - // - // FIXME(b/122554829): The current implementations of - // os/exec doesn't allow to set ambient capabilities if - // a process is started in a new user namespace. As a - // workaround, we start the sandbox process with the 0 - // UID and then it constructs a chroot and sets UID to - // nobody. https://github.com/golang/go/issues/2315 - const nobody = 65534 - cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ - { - ContainerID: int(0), - HostID: int(nobody - 1), - Size: int(1), - }, - { - ContainerID: int(nobody), - HostID: int(nobody), - Size: int(1), - }, - } - cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ - { - ContainerID: int(nobody), - HostID: int(nobody), - Size: int(1), - }, + if conf.Rootless { + log.Infof("Rootless mode: sandbox will run as root inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid()) + cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ + { + ContainerID: 0, + HostID: os.Getuid(), + Size: 1, + }, + } + cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ + { + ContainerID: 0, + HostID: os.Getgid(), + Size: 1, + }, + } + cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} + + } else { + // Map nobody in the new namespace to nobody in the parent namespace. + // + // A sandbox process will construct an empty + // root for itself, so it has to have the CAP_SYS_ADMIN + // capability. + // + // FIXME(b/122554829): The current implementations of + // os/exec doesn't allow to set ambient capabilities if + // a process is started in a new user namespace. As a + // workaround, we start the sandbox process with the 0 + // UID and then it constructs a chroot and sets UID to + // nobody. https://github.com/golang/go/issues/2315 + const nobody = 65534 + cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ + { + ContainerID: 0, + HostID: nobody - 1, + Size: 1, + }, + { + ContainerID: nobody, + HostID: nobody, + Size: 1, + }, + } + cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ + { + ContainerID: nobody, + HostID: nobody, + Size: 1, + }, + } + + // Set credentials to run as user and group nobody. 
+ cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: nobody} } - // Set credentials to run as user and group nobody. - cmd.SysProcAttr.Credential = &syscall.Credential{ - Uid: 0, - Gid: nobody, - } - cmd.Args = append(cmd.Args, "--setup-root") } else { return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") } @@ -649,7 +667,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { // WaitPID waits for process 'pid' in the container's sandbox and returns its // WaitStatus. -func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) { +func (s *Sandbox) WaitPID(cid string, pid int32) (syscall.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws syscall.WaitStatus conn, err := s.sandboxConnect() @@ -659,9 +677,8 @@ func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.Wait defer conn.Close() args := &boot.WaitPIDArgs{ - PID: pid, - CID: cid, - ClearStatus: clearStatus, + PID: pid, + CID: cid, } if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err) diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 15476de6f..0456e4c4f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -10,10 +10,7 @@ go_library( "specutils.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", - visibility = [ - "//runsc:__subpackages__", - "//test:__subpackages__", - ], + visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", "//pkg/log", diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go index 1f3afb4e4..6e6902e9f 100644 --- a/runsc/specutils/fs.go +++ b/runsc/specutils/fs.go @@ -16,6 +16,7 @@ package specutils import ( "fmt" + "math/bits" "path" "syscall" @@ -105,22 +106,30 @@ func optionsToFlags(opts []string, source map[string]mapping) uint32 { return rv } -// ValidateMount validates that spec mounts are correct. +// validateMount validates that spec mounts are correct. func validateMount(mnt *specs.Mount) error { if !path.IsAbs(mnt.Destination) { return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt) } - if mnt.Type == "bind" { - for _, o := range mnt.Options { - if ContainsStr(invalidOptions, o) { - return fmt.Errorf("mount option %q is not supported: %v", o, mnt) - } - _, ok1 := optionsMap[o] - _, ok2 := propOptionsMap[o] - if !ok1 && !ok2 { - return fmt.Errorf("unknown mount option %q", o) - } + return ValidateMountOptions(mnt.Options) + } + return nil +} + +// ValidateMountOptions validates that mount options are correct. 
+func ValidateMountOptions(opts []string) error { + for _, o := range opts { + if ContainsStr(invalidOptions, o) { + return fmt.Errorf("mount option %q is not supported", o) + } + _, ok1 := optionsMap[o] + _, ok2 := propOptionsMap[o] + if !ok1 && !ok2 { + return fmt.Errorf("unknown mount option %q", o) + } + if err := validatePropagation(o); err != nil { + return err } } return nil @@ -133,5 +142,14 @@ func validateRootfsPropagation(opt string) error { if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 { return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt) } + return validatePropagation(opt) +} + +func validatePropagation(opt string) error { + flags := PropOptionsToFlags([]string{opt}) + exclusive := flags & (syscall.MS_SLAVE | syscall.MS_PRIVATE | syscall.MS_SHARED | syscall.MS_UNBINDABLE) + if bits.OnesCount32(exclusive) > 1 { + return fmt.Errorf("mount propagation options are mutually exclusive: %q", opt) + } return nil } diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 7d194335c..06c13d1ab 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -220,3 +220,55 @@ func HasCapabilities(cs ...capability.Cap) bool { } return true } + +// MaybeRunAsRoot ensures the process runs with capabilities needed to create a +// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed, +// it will create a new user namespace and re-execute the process as root +// inside the namespace with the same arguments and environment. +// +// This function returns immediately when no new capability is needed. If +// another process is executed, it returns straight from here with the same exit +// code as the child. +func MaybeRunAsRoot() error { + if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) { + return nil + } + + // Current process doesn't have required capabilities, create user namespace + // and run as root inside the namespace to acquire capabilities. + log.Infof("*** Re-running as root in new user namespace ***") + + cmd := exec.Command("/proc/self/exe", os.Args[1:]...) + + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, + // Set current user/group as root inside the namespace. Since we may not + // have CAP_SETUID/CAP_SETGID, just map root to the current user/group. + UidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getuid(), Size: 1}, + }, + GidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getgid(), Size: 1}, + }, + Credential: &syscall.Credential{Uid: 0, Gid: 0}, + GidMappingsEnableSetgroups: false, + } + + cmd.Env = os.Environ() + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + if exit, ok := err.(*exec.ExitError); ok { + if ws, ok := exit.Sys().(syscall.WaitStatus); ok { + os.Exit(ws.ExitStatus()) + } + log.Warningf("No wait status provided, exiting with -1: %v", err) + os.Exit(-1) + } + return fmt.Errorf("re-executing self: %v", err) + } + // Child completed with success. 
+ os.Exit(0) + panic("unreachable") +} diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index 0c4e4fa80..04ed885c6 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -8,6 +8,7 @@ go_test( srcs = [ "exec_test.go", "integration_test.go", + "regression_test.go", ], embed = [":integration"], tags = [ diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index 7af064d79..7c0e61ac3 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -29,6 +29,7 @@ package integration import ( "fmt" "strconv" + "strings" "syscall" "testing" "time" @@ -136,3 +137,25 @@ func TestExecJobControl(t *testing.T) { t.Errorf("ws.ExitedStatus got %d, want %d", got, want) } } + +// Test that failure to exec returns proper error message. +func TestExecError(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("exec-error-test") + + // Start the container. + if err := d.Run("alpine", "sleep", "1000"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + _, err := d.Exec("no_can_find") + if err == nil { + t.Fatalf("docker exec didn't fail") + } + if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) { + t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want) + } +} diff --git a/runsc/test/integration/regression_test.go b/runsc/test/integration/regression_test.go new file mode 100644 index 000000000..80bae9970 --- /dev/null +++ b/runsc/test/integration/regression_test.go @@ -0,0 +1,45 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package integration + +import ( + "strings" + "testing" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// Test that UDS can be created using overlay when parent directory is in lower +// layer only (b/134090485). +// +// Prerequisite: the directory where the socket file is created must not have +// been open for write before bind(2) is called. 
+func TestBindOverlay(t *testing.T) { + if err := testutil.Pull("ubuntu:trusty"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("bind-overlay-test") + + cmd := "nc -l -U /var/run/sock& sleep 1 && echo foobar-asdf | nc -U /var/run/sock" + got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd) + if err != nil { + t.Fatal("docker run failed:", err) + } + + if want := "foobar-asdf"; !strings.Contains(got, want) { + t.Fatalf("docker run output is missing %q: %s", want, got) + } + defer d.CleanUp() +} diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index ddec81444..eedf962a4 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -18,6 +18,5 @@ go_library( "@com_github_cenkalti_backoff//:go_default_library", "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", - "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 9efb1ba8e..1bd5adc54 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -30,7 +30,6 @@ import ( "os/exec" "os/signal" "path/filepath" - "runtime" "strings" "sync" "sync/atomic" @@ -39,7 +38,6 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/syndtr/gocapability/capability" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -136,6 +134,7 @@ func TestConfig() *boot.Config { Strace: true, FileAccess: boot.FileAccessExclusive, TestOnlyAllowRunAsCurrentUserWithoutChroot: true, + NumNetworkChannels: 1, } } @@ -283,54 +282,6 @@ func WaitForHTTP(port int, timeout time.Duration) error { return Poll(cb, timeout) } -// RunAsRoot ensures the test runs with CAP_SYS_ADMIN and CAP_SYS_CHROOT. If -// needed it will create a new user namespace and re-execute the test as root -// inside of the namespace. This function returns when it's running as root. If -// it needs to create another process, it will exit from there and not return. -func RunAsRoot() { - if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) { - return - } - - fmt.Println("*** Re-running test as root in new user namespace ***") - - // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run - // as root inside that namespace to get it. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - cmd := exec.Command("/proc/self/exe", os.Args[1:]...) - cmd.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, - // Set current user/group as root inside the namespace. - UidMappings: []syscall.SysProcIDMap{ - {ContainerID: 0, HostID: os.Getuid(), Size: 1}, - }, - GidMappings: []syscall.SysProcIDMap{ - {ContainerID: 0, HostID: os.Getgid(), Size: 1}, - }, - GidMappingsEnableSetgroups: false, - Credential: &syscall.Credential{ - Uid: 0, - Gid: 0, - }, - } - cmd.Env = os.Environ() - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - if exit, ok := err.(*exec.ExitError); ok { - if ws, ok := exit.Sys().(syscall.WaitStatus); ok { - os.Exit(ws.ExitStatus()) - } - os.Exit(-1) - } - panic(fmt.Sprint("error running child process:", err.Error())) - } - os.Exit(0) -} - // Reaper reaps child processes. type Reaper struct { // mu protects ch, which will be nil if the reaper is not running. |
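Editor's note: the network changes in this diff split AF_PACKET socket setup into a per-channel createSocket helper. Below is a minimal, self-contained sketch of that technique, not code taken from this commit: it opens one raw AF_PACKET socket, binds it to a single interface, and forces a 4MB receive buffer with SO_RCVBUFFORCE so that net.core.rmem_max does not cap throughput. The function name openPacketSocket, the main wrapper, and the "eth0" example are illustrative only.

// Minimal sketch of one network channel: an AF_PACKET socket bound to a
// single interface with an enlarged receive buffer.
package main

import (
	"fmt"
	"net"
	"os"
	"syscall"
)

func openPacketSocket(ifaceName string) (*os.File, error) {
	const protocol = 0x0300 // htons(ETH_P_ALL): receive all protocols.

	iface, err := net.InterfaceByName(ifaceName)
	if err != nil {
		return nil, fmt.Errorf("looking up %q: %v", ifaceName, err)
	}

	fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
	if err != nil {
		return nil, fmt.Errorf("creating raw socket: %v", err)
	}

	// Bind the socket to this one device so several channels can share the NIC.
	sa := syscall.SockaddrLinklayer{
		Protocol: protocol,
		Ifindex:  iface.Index,
	}
	if err := syscall.Bind(fd, &sa); err != nil {
		syscall.Close(fd)
		return nil, fmt.Errorf("binding to %q: %v", ifaceName, err)
	}

	// SO_RCVBUFFORCE ignores the net.core.rmem_max cap (requires CAP_NET_ADMIN).
	const rcvBufSize = 4 << 20 // 4MB.
	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
		syscall.Close(fd)
		return nil, fmt.Errorf("setting SO_RCVBUFFORCE: %v", err)
	}

	return os.NewFile(uintptr(fd), "packet-socket-"+ifaceName), nil
}

func main() {
	f, err := openPacketSocket("eth0") // hypothetical interface name
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer f.Close()
	fmt.Println("opened", f.Name())
}

SO_RCVBUFFORCE needs CAP_NET_ADMIN; without that capability, plain SO_RCVBUF (clamped by rmem_max) is the fallback.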
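MaybeRunAsRoot and the rootless uid/gid mappings added in this diff both rely on the same kernel facility: running a process in a new user namespace with the calling user mapped to uid/gid 0. The following is a stripped-down sketch of that pattern under the assumption of a hypothetical program that simply wants to become root in its own namespace; the trigger condition and the name rerunAsRootInUserNS are illustrative, only the SysProcAttr fields come from the standard library.

// Minimal sketch of re-executing the current binary as root inside a new
// user namespace, mapping the caller to uid/gid 0.
package main

import (
	"fmt"
	"os"
	"os/exec"
	"syscall"
)

func rerunAsRootInUserNS() error {
	cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
	cmd.SysProcAttr = &syscall.SysProcAttr{
		// New user and mount namespaces; the caller becomes uid/gid 0 inside them.
		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
		UidMappings: []syscall.SysProcIDMap{
			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
		},
		GidMappings: []syscall.SysProcIDMap{
			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
		},
		GidMappingsEnableSetgroups: false,
		Credential:                 &syscall.Credential{Uid: 0, Gid: 0},
	}
	cmd.Stdin, cmd.Stdout, cmd.Stderr = os.Stdin, os.Stdout, os.Stderr

	if err := cmd.Run(); err != nil {
		// Propagate the child's exit code so callers see the real status.
		if exit, ok := err.(*exec.ExitError); ok {
			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
				os.Exit(ws.ExitStatus())
			}
		}
		return fmt.Errorf("re-executing self: %v", err)
	}
	os.Exit(0)
	panic("unreachable")
}

func main() {
	if os.Getuid() != 0 { // hypothetical trigger; the real code checks capabilities
		if err := rerunAsRootInUserNS(); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}
	fmt.Println("running with uid", os.Getuid())
}

Exiting with the child's status matters because container managers and test runners inspect the exit code of the outer process.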
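The specutils/fs.go change validates that mount propagation options are mutually exclusive by counting set MS_* bits. A small sketch of that check applied to a whole option list follows; propOptionsToFlags is a stand-in for the real PropOptionsToFlags mapping, and the option spellings follow common mount(8) usage rather than this commit.

// Minimal sketch: reject option lists that request more than one propagation mode.
package main

import (
	"fmt"
	"math/bits"
	"syscall"
)

func propOptionsToFlags(opts []string) uint32 {
	var flags uint32
	for _, o := range opts {
		switch o {
		case "private", "rprivate":
			flags |= syscall.MS_PRIVATE
		case "slave", "rslave":
			flags |= syscall.MS_SLAVE
		case "shared", "rshared":
			flags |= syscall.MS_SHARED
		case "unbindable", "runbindable":
			flags |= syscall.MS_UNBINDABLE
		}
	}
	return flags
}

func validateMountPropagation(opts []string) error {
	flags := propOptionsToFlags(opts)
	exclusive := flags & (syscall.MS_SLAVE | syscall.MS_PRIVATE | syscall.MS_SHARED | syscall.MS_UNBINDABLE)
	if bits.OnesCount32(exclusive) > 1 {
		return fmt.Errorf("mount propagation options are mutually exclusive: %v", opts)
	}
	return nil
}

func main() {
	fmt.Println(validateMountPropagation([]string{"rprivate"}))          // <nil>
	fmt.Println(validateMountPropagation([]string{"private", "shared"})) // error
}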