diff options
Diffstat (limited to 'runsc')
-rw-r--r-- | runsc/BUILD | 12 | ||||
-rw-r--r-- | runsc/boot/BUILD | 5 | ||||
-rw-r--r-- | runsc/boot/config.go | 7 | ||||
-rw-r--r-- | runsc/boot/controller.go | 14 | ||||
-rw-r--r-- | runsc/boot/filter/config.go | 11 | ||||
-rw-r--r-- | runsc/boot/fs.go | 9 | ||||
-rw-r--r-- | runsc/boot/loader.go | 262 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 8 | ||||
-rw-r--r-- | runsc/boot/network.go | 4 | ||||
-rw-r--r-- | runsc/boot/vfs.go | 135 | ||||
-rw-r--r-- | runsc/cgroup/BUILD | 4 | ||||
-rw-r--r-- | runsc/cmd/BUILD | 8 | ||||
-rw-r--r-- | runsc/cmd/boot.go | 6 | ||||
-rw-r--r-- | runsc/cmd/spec.go | 220 | ||||
-rw-r--r-- | runsc/container/BUILD | 4 | ||||
-rw-r--r-- | runsc/container/console_test.go | 1 | ||||
-rw-r--r-- | runsc/container/container.go | 14 | ||||
-rw-r--r-- | runsc/container/multi_container_test.go | 184 | ||||
-rwxr-xr-x | runsc/debian/postinst.sh | 9 | ||||
-rw-r--r-- | runsc/fsgofer/BUILD | 1 | ||||
-rw-r--r-- | runsc/fsgofer/filter/config.go | 1 | ||||
-rw-r--r-- | runsc/fsgofer/fsgofer.go | 148 | ||||
-rw-r--r-- | runsc/fsgofer/fsgofer_test.go | 114 | ||||
-rw-r--r-- | runsc/main.go | 2 | ||||
-rw-r--r-- | runsc/sandbox/BUILD | 2 | ||||
-rw-r--r-- | runsc/sandbox/network.go | 1 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 4 | ||||
-rw-r--r-- | runsc/specutils/BUILD | 4 |
28 files changed, 697 insertions, 497 deletions
diff --git a/runsc/BUILD b/runsc/BUILD index 757f6d44c..96f697a5f 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -62,18 +62,22 @@ go_binary( ) pkg_tar( - name = "runsc-bin", - srcs = [":runsc"], + name = "debian-bin", + srcs = [ + ":runsc", + "//shim/v1:gvisor-containerd-shim", + "//shim/v2:containerd-shim-runsc-v1", + ], mode = "0755", package_dir = "/usr/bin", - strip_prefix = "/runsc/linux_amd64_pure_stripped", ) pkg_tar( name = "debian-data", extension = "tar.gz", deps = [ - ":runsc-bin", + ":debian-bin", + "//shim:config", ], ) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index aad2a41de..9f52438c2 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -90,6 +90,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/packetsocket", "//pkg/tcpip/link/qdisc/fifo", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", @@ -106,7 +107,7 @@ go_library( "//runsc/boot/pprof", "//runsc/specutils", "@com_github_golang_protobuf//proto:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -131,7 +132,7 @@ go_test( "//pkg/sync", "//pkg/unet", "//runsc/fsgofer", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/boot/config.go b/runsc/boot/config.go index bb01b8fb5..80da8b3e6 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -274,6 +274,9 @@ type Config struct { // Enables VFS2 (not plumbled through yet). VFS2 bool + + // Enables FUSE usage (not plumbled through yet). + FUSE bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -325,5 +328,9 @@ func (c *Config) ToFlags() []string { f = append(f, "--vfs2=true") } + if c.FUSE { + f = append(f, "--fuse=true") + } + return f } diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 8125d5061..3e5e4c22f 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -155,7 +155,7 @@ func newController(fd int, l *Loader) (*controller, error) { srv.Register(&debug{}) srv.Register(&control.Logging{}) - if l.conf.ProfileEnable { + if l.root.conf.ProfileEnable { srv.Register(&control.Profile{ Kernel: l.k, }) @@ -333,7 +333,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Pause the kernel while we build a new one. cm.l.k.Pause() - p, err := createPlatform(cm.l.conf, deviceFile) + p, err := createPlatform(cm.l.root.conf, deviceFile) if err != nil { return fmt.Errorf("creating platform: %v", err) } @@ -349,8 +349,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. - mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints) - renv, err := mntr.createRestoreEnvironment(cm.l.conf) + mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints) + renv, err := mntr.createRestoreEnvironment(cm.l.root.conf) if err != nil { return fmt.Errorf("creating RestoreEnvironment: %v", err) } @@ -368,7 +368,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("file cannot be empty") } - if cm.l.conf.ProfileEnable { + if cm.l.root.conf.ProfileEnable { // pprof.Initialize opens /proc/self/maps, so has to be called before // installing seccomp filters. pprof.Initialize() @@ -387,13 +387,13 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Since we have a new kernel we also must make a new watchdog. dogOpts := watchdog.DefaultOpts - dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction + dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction dog := watchdog.New(k, dogOpts) // Change the loader fields to reflect the changes made when restoring. cm.l.k = k cm.l.watchdog = dog - cm.l.rootProcArgs = kernel.CreateProcessArgs{} + cm.l.root.procArgs = kernel.CreateProcessArgs{} cm.l.restore = true // Reinitialize the sandbox ID and processes map. Note that it doesn't diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 60e33425f..149eb0b1b 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -310,19 +310,12 @@ var allowedSyscalls = seccomp.SyscallRules{ }, }, syscall.SYS_WRITE: {}, - // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with - // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR - // option is enabled for a packet socket. + // For rawfile.NonBlockingWriteIovec. syscall.SYS_WRITEV: []seccomp.Rule{ { seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(2), - }, - { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(3), + seccomp.GreaterThan(0), }, }, } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e83584b82..59639ba19 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -29,6 +29,7 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" + "gvisor.dev/gvisor/pkg/sentry/vfs" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" @@ -390,6 +391,10 @@ type mountHint struct { // root is the inode where the volume is mounted. For mounts with 'pod' share // the volume is mounted once and then bind mounted inside the containers. root *fs.Inode + + // vfsMount is the master mount for the volume. For mounts with 'pod' share + // the master volume is bind mounted inside the containers. + vfsMount *vfs.Mount } func (m *mountHint) setField(key, val string) error { @@ -571,9 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin // processHints processes annotations that container hints about how volumes // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. -func (c *containerMounter) processHints(conf *Config) error { +func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error { if conf.VFS2 { - return nil + return c.processHintsVFS2(conf, creds) } ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index b5df1deb9..9cd9c5909 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -77,29 +77,34 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) -// Loader keeps state needed to start the kernel and run the container.. -type Loader struct { - // k is the kernel. - k *kernel.Kernel - - // ctrl is the control server. - ctrl *controller - +type containerInfo struct { conf *Config - // console is set to true if terminal is enabled. - console bool + // spec is the base configuration for the root container. + spec *specs.Spec - watchdog *watchdog.Watchdog + // procArgs refers to the container's init task. + procArgs kernel.CreateProcessArgs // stdioFDs contains stdin, stdout, and stderr. stdioFDs []int // goferFDs are the FDs that attach the sandbox to the gofers. goferFDs []int +} - // spec is the base configuration for the root container. - spec *specs.Spec +// Loader keeps state needed to start the kernel and run the container.. +type Loader struct { + // k is the kernel. + k *kernel.Kernel + + // ctrl is the control server. + ctrl *controller + + // root contains information about the root container in the sandbox. + root containerInfo + + watchdog *watchdog.Watchdog // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. @@ -108,9 +113,6 @@ type Loader struct { // restore is set to true if we are restoring a container. restore bool - // rootProcArgs refers to the root sandbox init task. - rootProcArgs kernel.CreateProcessArgs - // sandboxID is the ID for the whole sandbox. sandboxID string @@ -175,8 +177,6 @@ type Args struct { // StdioFDs is the stdio for the application. The Loader takes ownership of // these FDs and may close them at any time. StdioFDs []int - // Console is set to true if using TTY. - Console bool // NumCPU is the number of CPUs to create inside the sandbox. NumCPU int // TotalMem is the initial amount of total memory to report back to the @@ -205,6 +205,10 @@ func New(args Args) (*Loader, error) { // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true + if args.Conf.FUSE { + kernel.FUSEEnabled = true + } + vfs2.Override() } @@ -227,9 +231,7 @@ func New(args Args) (*Loader, error) { // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. - // - // FIXME(b/109889800): Use non-nil context. - vdso, err := loader.PrepareVDSO(nil, k) + vdso, err := loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("creating vdso: %v", err) } @@ -300,6 +302,12 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } + if kernel.VFS2Enabled { + if err := registerFilesystems(k); err != nil { + return nil, fmt.Errorf("registering filesystems: %w", err) + } + } + if err := adjustDirentCache(k); err != nil { return nil, err } @@ -318,7 +326,7 @@ func New(args Args) (*Loader, error) { dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction dog := watchdog.New(k, dogOpts) - procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) + procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) if err != nil { return nil, fmt.Errorf("creating init process for root container: %v", err) } @@ -366,17 +374,18 @@ func New(args Args) (*Loader, error) { eid := execID{cid: args.ID} l := &Loader{ - k: k, - conf: args.Conf, - console: args.Console, - watchdog: dog, - spec: args.Spec, - goferFDs: args.GoferFDs, - stdioFDs: stdioFDs, - rootProcArgs: procArgs, - sandboxID: args.ID, - processes: map[execID]*execProcess{eid: {}}, - mountHints: mountHints, + k: k, + watchdog: dog, + sandboxID: args.ID, + processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, + root: containerInfo{ + conf: args.Conf, + stdioFDs: stdioFDs, + goferFDs: args.GoferFDs, + spec: args.Spec, + procArgs: procArgs, + }, } // We don't care about child signals; some platforms can generate a @@ -404,8 +413,8 @@ func New(args Args) (*Loader, error) { return l, nil } -// newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { +// createProcessArgs creates args that can be used with kernel.CreateProcess. +func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -479,13 +488,13 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { } func (l *Loader) installSeccompFilters() error { - if l.conf.DisableSeccomp { + if l.root.conf.DisableSeccomp { filter.Report("syscall filter is DISABLED. Running in less secure mode.") } else { opts := filter.Options{ Platform: l.k.Platform, - HostNetwork: l.conf.Network == NetworkHost, - ProfileEnable: l.conf.ProfileEnable, + HostNetwork: l.root.conf.Network == NetworkHost, + ProfileEnable: l.root.conf.ProfileEnable, ControllerFD: l.ctrl.srv.FD(), } if err := filter.Install(opts); err != nil { @@ -511,7 +520,7 @@ func (l *Loader) Run() error { } func (l *Loader) run() error { - if l.conf.Network == NetworkHost { + if l.root.conf.Network == NetworkHost { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. log.Debugf("Configuring host network") @@ -532,10 +541,8 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. - var ttyFile *host.TTYFileOperations - var ttyFileVFS2 *hostvfs2.TTYFileDescription if !l.restore { - if l.conf.ProfileEnable { + if l.root.conf.ProfileEnable { pprof.Initialize() } @@ -545,82 +552,29 @@ func (l *Loader) run() error { return err } - // Create the FD map, which will set stdin, stdout, and stderr. If console - // is true, then ioctl calls will be passed through to the host fd. - ctx := l.rootProcArgs.NewContext(l.k) - var err error - - // CreateProcess takes a reference on FDMap if successful. We won't need - // ours either way. - l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs) - if err != nil { - return fmt.Errorf("importing fds: %v", err) - } - - // Setup the root container file system. - l.startGoferMonitor(l.sandboxID, l.goferFDs) - - mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { - return err - } - if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil { - return err - } - - // Add the HOME enviroment variable if it is not already set. - var envv []string - if kernel.VFS2Enabled { - envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, - l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) - - } else { - envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, - l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) - } - if err != nil { - return err - } - l.rootProcArgs.Envv = envv - // Create the root container init task. It will begin running // when the kernel is started. - if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { - return fmt.Errorf("creating init process: %v", err) + if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { + return err } - - // CreateProcess takes a reference on FDTable if successful. - l.rootProcArgs.FDTable.DecRef() } ep.tg = l.k.GlobalInit() - if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok { + if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok { ep.pidnsPath = ns.Path } - if l.console { - // Set the foreground process group on the TTY to the global init process - // group, since that is what we are about to start running. - switch { - case ttyFileVFS2 != nil: - ep.ttyVFS2 = ttyFileVFS2 - ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup()) - case ttyFile != nil: - ep.tty = ttyFile - ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup()) - } - } // Handle signals by forwarding them to the root container process // (except for panic signal, which should cause a panic). l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { // Panic signal should cause a panic. - if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) { panic("Signal-induced panic") } // Otherwise forward to root container. deliveryMode := DeliverToProcess - if l.console { + if l.root.spec.Process.Terminal { // Since we are running with a console, we should forward the signal to // the foreground process group so that job control signals like ^C can // be handled properly. @@ -637,7 +591,7 @@ func (l *Loader) run() error { // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the // passed FDs, so only close for VFS1. if !kernel.VFS2Enabled { - for _, fd := range l.stdioFDs { + for _, fd := range l.root.stdioFDs { err := syscall.Close(fd) if err != nil { return fmt.Errorf("close dup()ed stdioFDs: %v", err) @@ -676,8 +630,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file l.mu.Lock() defer l.mu.Unlock() - eid := execID{cid: cid} - if _, ok := l.processes[eid]; !ok { + ep := l.processes[execID{cid: cid}] + if ep == nil { return fmt.Errorf("trying to start a deleted container %q", cid) } @@ -711,76 +665,112 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file if pidns == nil { pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) } - l.processes[eid].pidnsPath = ns.Path + ep.pidnsPath = ns.Path } else { pidns = l.k.RootPIDNamespace() } - procArgs, err := newProcess(cid, spec, creds, l.k, pidns) + + info := &containerInfo{ + conf: conf, + spec: spec, + } + info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } // setupContainerFS() dups stdioFDs, so we don't need to dup them here. - var stdioFDs []int for _, f := range files[:3] { - stdioFDs = append(stdioFDs, int(f.Fd())) + info.stdioFDs = append(info.stdioFDs, int(f.Fd())) } - // Create the FD map, which will set stdin, stdout, and stderr. - ctx := procArgs.NewContext(l.k) - fdTable, _, _, err := createFDTable(ctx, false, stdioFDs) - if err != nil { - return fmt.Errorf("importing fds: %v", err) - } - // CreateProcess takes a reference on fdTable if successful. We won't - // need ours either way. - procArgs.FDTable = fdTable - // Can't take ownership away from os.File. dup them to get a new FDs. - var goferFDs []int for _, f := range files[3:] { fd, err := syscall.Dup(int(f.Fd())) if err != nil { return fmt.Errorf("failed to dup file: %v", err) } - goferFDs = append(goferFDs, fd) + info.goferFDs = append(info.goferFDs, fd) + } + + tg, err := l.createContainerProcess(false, cid, info, ep) + if err != nil { + return err + } + + // Success! + l.k.StartProcess(tg) + ep.tg = tg + return nil +} + +func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) { + console := false + if root { + // Only root container supports terminal for now. + console = info.spec.Process.Terminal } + // Create the FD map, which will set stdin, stdout, and stderr. + ctx := info.procArgs.NewContext(l.k) + fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs) + if err != nil { + return nil, fmt.Errorf("importing fds: %v", err) + } + // CreateProcess takes a reference on fdTable if successful. We won't need + // ours either way. + info.procArgs.FDTable = fdTable + // Setup the child container file system. - l.startGoferMonitor(cid, goferFDs) + l.startGoferMonitor(cid, info.goferFDs) - mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints) - if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil { - return err + mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints) + if root { + if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { + return nil, err + } + } + if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil { + return nil, err } // Add the HOME enviroment variable if it is not already set. var envv []string if kernel.VFS2Enabled { - envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2, - procArgs.Credentials.RealKUID, procArgs.Envv) + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2, + info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } else { - envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace, - procArgs.Credentials.RealKUID, procArgs.Envv) + envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace, + info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } if err != nil { - return err + return nil, err } - procArgs.Envv = envv + info.procArgs.Envv = envv // Create and start the new process. - tg, _, err := l.k.CreateProcess(procArgs) + tg, _, err := l.k.CreateProcess(info.procArgs) if err != nil { - return fmt.Errorf("creating process: %v", err) + return nil, fmt.Errorf("creating process: %v", err) } - l.k.StartProcess(tg) - // CreateProcess takes a reference on FDTable if successful. - procArgs.FDTable.DecRef() + info.procArgs.FDTable.DecRef() - l.processes[eid].tg = tg - return nil + // Set the foreground process group on the TTY to the global init process + // group, since that is what we are about to start running. + if root { + switch { + case ttyFileVFS2 != nil: + ep.ttyVFS2 = ttyFileVFS2 + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFile != nil: + ep.tty = ttyFile + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) + } + } + + return tg, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index e448fd773..8e6fe57e1 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -479,13 +479,13 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { defer l.Destroy() defer loaderCleanup() - mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { + mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil { t.Fatalf("failed process hints: %v", err) } ctx := l.k.SupervisorContext() - mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) + mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs) if err != nil { t.Fatalf("failed to setupVFS2: %v", err) } @@ -499,7 +499,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { Path: fspath.Parse(p), } - if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { + if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { d.DecRef() diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 14d2f56a5..4e1fa7665 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" @@ -252,6 +253,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } + // Enable support for AF_PACKET sockets to receive outgoing packets. + linkEP = packetsocket.New(linkEP) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index b68117867..9a1ed8e9e 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -43,7 +43,11 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { +func registerFilesystems(k *kernel.Kernel) error { + ctx := k.SupervisorContext() + creds := auth.NewRootCredentials(k.RootUserNamespace()) + vfsObj := k.VFS() + vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, // TODO(b/29356795): Users may mount this once the terminals are in a @@ -73,6 +77,10 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre AllowUserMount: true, AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) // Setup files in devtmpfs. if err := memdev.Register(vfsObj); err != nil { @@ -82,9 +90,12 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre return fmt.Errorf("registering ttydev: %w", err) } - if err := fuse.Register(vfsObj); err != nil { - return fmt.Errorf("registering fusedev: %w", err) + if kernel.FUSEEnabled { + if err := fuse.Register(vfsObj); err != nil { + return fmt.Errorf("registering fusedev: %w", err) + } } + if err := tundev.Register(vfsObj); err != nil { return fmt.Errorf("registering tundev: %v", err) } @@ -106,16 +117,17 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { return fmt.Errorf("creating tundev devtmpfs files: %v", err) } - if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { - return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + + if kernel.FUSEEnabled { + if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { + return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + } } + return nil } func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { - if err := mntr.k.VFS().Init(); err != nil { - return fmt.Errorf("failed to initialize VFS: %w", err) - } mns, err := mntr.setupVFS2(ctx, conf, procArgs) if err != nil { return fmt.Errorf("failed to setupFS: %w", err) @@ -144,10 +156,6 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals rootCtx := procArgs.NewContext(c.k) - if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil { - return nil, fmt.Errorf("register filesystems: %w", err) - } - mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) if err != nil { return nil, fmt.Errorf("creating mount namespace: %w", err) @@ -182,8 +190,14 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, for i := range mounts { submount := &mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) - if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { - return err + if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) + } + } else { + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { + return fmt.Errorf("mount submount %q: %w", submount.Destination, err) + } } } @@ -257,20 +271,18 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) { - var ( - fsName string - data []string - ) + fsName := m.Type + var data []string // Find filesystem name and FS specific data field. switch m.Type { case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: - fsName = m.Type + // Nothing to do. + case nonefs: fsName = sys.Name - case tmpfs.Name: - fsName = m.Type + case tmpfs.Name: var err error data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { @@ -279,10 +291,16 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF case bind: fsName = gofer.Name + if m.fd == 0 { + // Check that an FD was provided to fails fast. Technically FD=0 is valid, + // but unlikely to be correct in this context. + return "", nil, fmt.Errorf("9P mount requires a connection FD") + } data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) default: log.Warningf("ignoring unknown filesystem type %q", m.Type) + return "", nil, nil } opts := &vfs.MountOptions{ @@ -322,7 +340,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s } _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) if err == nil { - // Mount point exists, nothing else to do. + log.Debugf("Mount point %q already exists", currentPath) return nil } if err != syserror.ENOENT { @@ -400,3 +418,76 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds return fmt.Errorf(`stating "/tmp" inside container: %w`, err) } } + +// processHintsVFS2 processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error { + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs.Name { + continue + } + + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.vfsMount = mnt + } + return nil +} + +// mountSharedMasterVFS2 mounts the master of a volume that is shared among +// containers in a pod. +func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + mntFD := &mountAndFD{Mount: hint.mount} + fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error { + if err := source.checkCompatible(mount); err != nil { + return err + } + + _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + if err != nil { + return err + } + newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts) + if err != nil { + return err + } + defer newMnt.DecRef() + + root := mns.Root() + defer root.DecRef() + if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil { + return err + } + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(mount.Destination), + } + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { + return err + } + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return nil +} diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index 7e34a284a..37f4253ba 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -10,7 +10,7 @@ go_library( "//pkg/cleanup", "//pkg/log", "@com_github_cenkalti_backoff//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) @@ -22,6 +22,6 @@ go_test( tags = ["local"], deps = [ "//pkg/test/testutil", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index dae9b3b3e..1b5178dd5 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -58,7 +58,7 @@ go_library( "//runsc/fsgofer/filter", "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], @@ -87,9 +87,9 @@ go_test( "//runsc/boot", "//runsc/container", "//runsc/specutils", - "@com_github_google_go-cmp//cmp:go_default_library", - "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 01204ab4d..f4f247721 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -54,10 +54,6 @@ type Boot struct { // provided in that order. stdioFDs intFlags - // console is set to true if the sandbox should allow terminal ioctl(2) - // syscalls. - console bool - // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool @@ -115,7 +111,6 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") - f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") @@ -229,7 +224,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Device: os.NewFile(uintptr(b.deviceFD), "platform device"), GoferFDs: b.ioFDs.GetArray(), StdioFDs: b.stdioFDs.GetArray(), - Console: b.console, NumCPU: b.cpuNum, TotalMem: b.totalMem, UserLogFD: b.userLogFD, diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index a2b0a4b14..55194e641 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -16,124 +16,122 @@ package cmd import ( "context" - "fmt" - "io/ioutil" + "encoding/json" + "io" "os" "path/filepath" "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/runsc/flag" ) -func genSpec(cwd string) []byte { - var template = fmt.Sprintf(`{ - "ociVersion": "1.0.0", - "process": { - "terminal": true, - "user": { - "uid": 0, - "gid": 0 - }, - "args": [ - "sh" - ], - "env": [ - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm" - ], - "cwd": "%s", - "capabilities": { - "bounding": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "effective": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "inheritable": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "permitted": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "ambient": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ] - }, - "rlimits": [ - { - "type": "RLIMIT_NOFILE", - "hard": 1024, - "soft": 1024 - } - ] - }, - "root": { - "path": "rootfs", - "readonly": true - }, - "hostname": "runsc", - "mounts": [ - { - "destination": "/proc", - "type": "proc", - "source": "proc" +func writeSpec(w io.Writer, cwd string, netns string, args []string) error { + spec := &specs.Spec{ + Version: "1.0.0", + Process: &specs.Process{ + Terminal: true, + User: specs.User{ + UID: 0, + GID: 0, + }, + Args: args, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: cwd, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + // TODO(gvisor.dev/issue/3166): support ambient capabilities + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: 1024, + Soft: 1024, + }, + }, }, - { - "destination": "/dev", - "type": "tmpfs", - "source": "tmpfs", - "options": [] + Root: &specs.Root{ + Path: "rootfs", + Readonly: true, }, - { - "destination": "/sys", - "type": "sysfs", - "source": "sysfs", - "options": [ - "nosuid", - "noexec", - "nodev", - "ro" - ] - } - ], - "linux": { - "namespaces": [ + Hostname: "runsc", + Mounts: []specs.Mount{ { - "type": "pid" + Destination: "/proc", + Type: "proc", + Source: "proc", }, { - "type": "network" + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", }, { - "type": "ipc" + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{ + "nosuid", + "noexec", + "nodev", + "ro", + }, }, - { - "type": "uts" + }, + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + Path: netns, + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, }, - { - "type": "mount" - } - ] + }, } -}`, cwd) - return []byte(template) + e := json.NewEncoder(w) + e.SetIndent("", " ") + return e.Encode(spec) } // Spec implements subcommands.Command for the "spec" command. type Spec struct { bundle string cwd string + netns string } // Name implements subcommands.Command.Name. @@ -148,21 +146,26 @@ func (*Spec) Synopsis() string { // Usage implements subcommands.Command.Usage. func (*Spec) Usage() string { - return `spec [options] - create a new OCI bundle specification file. + return `spec [options] [-- args...] - create a new OCI bundle specification file. + +The spec command creates a new specification file (config.json) for a new OCI +bundle. -The spec command creates a new specification file (config.json) for a new OCI bundle. +The specification file is a starter file that runs the command specified by +'args' in the container. If 'args' is not specified the default is to run the +'sh' program. -The specification file is a starter file that runs the "sh" command in the container. You -should edit the file to suit your needs. You can find out more about the format of the -specification file by visiting the OCI runtime spec repository: +While a number of flags are provided to change values in the specification, you +can examine the file and edit it to suit your needs after this command runs. +You can find out more about the format of the specification file by visiting +the OCI runtime spec repository: https://github.com/opencontainers/runtime-spec/ EXAMPLE: $ mkdir -p bundle/rootfs $ cd bundle - $ runsc spec + $ runsc spec -- /hello $ docker export $(docker create hello-world) | tar -xf - -C rootfs - $ sed -i 's;"sh";"/hello";' config.json $ sudo runsc run hello ` @@ -173,18 +176,29 @@ func (s *Spec) SetFlags(f *flag.FlagSet) { f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle") f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+ "this value MUST be an absolute path") + f.StringVar(&s.netns, "netns", "", "network namespace path") } // Execute implements subcommands.Command.Execute. func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + // Grab the arguments. + containerArgs := f.Args() + if len(containerArgs) == 0 { + containerArgs = []string{"sh"} + } + confPath := filepath.Join(s.bundle, "config.json") if _, err := os.Stat(confPath); !os.IsNotExist(err) { Fatalf("file %q already exists", confPath) } - var spec = genSpec(s.cwd) + configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664) + if err != nil { + Fatalf("opening file %q: %v", confPath, err) + } - if err := ioutil.WriteFile(confPath, spec, 0664); err != nil { + err = writeSpec(configFile, s.cwd, s.netns, containerArgs) + if err != nil { Fatalf("writing to %q: %v", confPath, err) } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 49cfb0837..9a9ee7e2a 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -27,7 +27,7 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_gofrs_flock//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) @@ -68,7 +68,7 @@ go_test( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_kr_pty//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 3813c6b93..995d4e267 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -122,6 +122,7 @@ func TestConsoleSocket(t *testing.T) { for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { spec := testutil.NewSpecWithArgs("true") + spec.Process.Terminal = true _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) diff --git a/runsc/container/container.go b/runsc/container/container.go index 6d297d0df..7ad09bf23 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -324,7 +324,7 @@ func New(conf *boot.Config, args Args) (*Container, error) { } } if err := runInCgroup(cg, func() error { - ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir) + ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached) if err != nil { return err } @@ -427,7 +427,7 @@ func (c *Container) Start(conf *boot.Config) error { // the start (and all their children processes). if err := runInCgroup(c.Sandbox.Cgroup, func() error { // Create the gofer process. - ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) if err != nil { return err } @@ -861,7 +861,7 @@ func (c *Container) waitForStopped() error { return backoff.Retry(op, b) } -func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) { +func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) { // Start with the general config flags. args := conf.ToFlags() @@ -955,6 +955,14 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund cmd.ExtraFiles = goferEnds cmd.Args[0] = "runsc-gofer" + if attached { + // The gofer is attached to the lifetime of this process, so it + // should synchronously die when this process dies. + cmd.SysProcAttr = &syscall.SysProcAttr{ + Pdeathsig: syscall.SIGKILL, + } + } + // Enter new namespaces to isolate from the rest of the system. Don't unshare // cgroup because gofer is added to a cgroup in the caller's namespace. nss := []specs.LinuxNamespace{ diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index a27a01942..e189648f4 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -100,19 +100,20 @@ type execDesc struct { c *Container cmd []string want int - desc string + name string } -func execMany(execs []execDesc) error { +func execMany(t *testing.T, execs []execDesc) { for _, exec := range execs { - args := &control.ExecArgs{Argv: exec.cmd} - if ws, err := exec.c.executeSync(args); err != nil { - return fmt.Errorf("error executing %+v: %v", args, err) - } else if ws.ExitStatus() != exec.want { - return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want) - } + t.Run(exec.name, func(t *testing.T) { + args := &control.ExecArgs{Argv: exec.cmd} + if ws, err := exec.c.executeSync(args); err != nil { + t.Errorf("error executing %+v: %v", args, err) + } else if ws.ExitStatus() != exec.want { + t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want) + } + }) } - return nil } func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) { @@ -1072,7 +1073,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) { // Test that pod shared mounts are properly mounted in 2 containers and that // changes from one container is reflected in the other. func TestMultiContainerSharedMount(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1110,84 +1111,82 @@ func TestMultiContainerSharedMount(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", + name: "directory is mounted in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", + name: "directory is mounted in container1", }, { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", + name: "file appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", + name: "file appears in container1", }, { c: containers[1], cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", + name: "remove file from container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", + name: "file removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, { c: containers[1], cmd: []string{"/bin/mkdir", file1}, - desc: "create directory in container1", + name: "create directory in container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-d", file0}, - desc: "dir appears in container0", + name: "dir appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", file1}, - desc: "dir appears in container1", + name: "dir appears in container1", }, { c: containers[0], cmd: []string{"/bin/rmdir", file0}, - desc: "create directory in container0", + name: "remove directory from container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-d", file0}, - desc: "dir removed from container0", + name: "dir removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-d", file1}, - desc: "dir removed from container1", + name: "dir removed from container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } // Test that pod mounts are mounted as readonly when requested. func TestMultiContainerSharedMountReadonly(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1225,35 +1224,34 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", + name: "directory is mounted in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", + name: "directory is mounted in container1", }, { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, + cmd: []string{"/bin/touch", file0}, want: 1, - desc: "fails to write to container0", + name: "fails to write to container0", }, { c: containers[1], - cmd: []string{"/usr/bin/touch", file1}, + cmd: []string{"/bin/touch", file1}, want: 1, - desc: "fails to write to container1", + name: "fails to write to container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } // Test that shared pod mounts continue to work after container is restarted. func TestMultiContainerSharedMountRestart(t *testing.T) { + //TODO(gvisor.dev/issue/1487): This is failing with VFS2. for name, conf := range configs(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() @@ -1291,23 +1289,21 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { execs := []execDesc{ { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", + name: "file appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", + name: "file appears in container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) containers[1].Destroy() @@ -1334,32 +1330,30 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file is still in container0", + name: "file is still in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file is still in container1", + name: "file is still in container1", }, { c: containers[1], cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", + name: "file removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } @@ -1367,53 +1361,53 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { // Test that unsupported pod mounts options are ignored when matching master and // slave mounts. func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { - rootDir, cleanup, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer cleanup() - - conf := testutil.TestConfig(t) - conf.RootDir = rootDir + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir - // Setup the containers. - sleep := []string{"/bin/sleep", "100"} - podSpec, ids := createSpecs(sleep, sleep) - mnt0 := specs.Mount{ - Destination: "/mydir/test", - Source: "/some/dir", - Type: "tmpfs", - Options: []string{"rw", "rbind", "relatime"}, - } - podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + // Setup the containers. + sleep := []string{"/bin/sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: []string{"rw", "rbind", "relatime"}, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) - mnt1 := mnt0 - mnt1.Destination = "/mydir2/test2" - mnt1.Options = []string{"rw", "nosuid"} - podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + mnt1.Options = []string{"rw", "nosuid"} + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) - createSharedMount(mnt0, "test-mount", podSpec...) + createSharedMount(mnt0, "test-mount", podSpec...) - containers, cleanup, err := startContainers(conf, podSpec, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - execs := []execDesc{ - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", - }, - } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + name: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + name: "directory is mounted in container1", + }, + } + execMany(t, execs) + }) } } diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh index dc7aeee87..d1e28e17b 100755 --- a/runsc/debian/postinst.sh +++ b/runsc/debian/postinst.sh @@ -18,7 +18,14 @@ if [ "$1" != configure ]; then exit 0 fi +# Update docker configuration. if [ -f /etc/docker/daemon.json ]; then runsc install - systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2 + if systemctl status docker 2>/dev/null; then + systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2 + fi fi + +# For containerd-based installers, we don't automatically update the +# configuration. If it uses a v2 shim, then it will find the package binaries +# automatically when provided the appropriate annotation. diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 1036b0630..05e3637f7 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -31,5 +31,6 @@ go_test( deps = [ "//pkg/log", "//pkg/p9", + "//pkg/test/testutil", ], ) diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 1dce36965..88814b83c 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -128,6 +128,7 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_MADVISE: {}, unix.SYS_MEMFD_CREATE: {}, /// Used by flipcall.PacketWindowAllocator.Init(). syscall.SYS_MKDIRAT: {}, + syscall.SYS_MKNODAT: {}, // Used by the Go runtime as a temporarily workaround for a Linux // 5.2-5.4 bug. // diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 74977c313..82a46910e 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -132,19 +132,19 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) } - f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { + f, readable, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { return fd.Open(a.prefix, openFlags|mode, 0) }) if err != nil { return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) } - lf, err := newLocalFile(a, f, a.prefix, stat) + lf, err := newLocalFile(a, f, a.prefix, readable, stat) if err != nil { return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err) } @@ -212,6 +212,10 @@ type localFile struct { // opened with. file *fd.FD + // controlReadable tells whether 'file' was opened with read permissions + // during a walk. + controlReadable bool + // mode is the mode in which the file was opened. Set to invalidMode // if localFile isn't opened. mode p9.OpenFlags @@ -251,49 +255,57 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { return fd.New(d), nil } -func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) { +func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) { path := path.Join(parent.hostPath, name) - f, err := openAnyFile(path, func(mode int) (*fd.FD, error) { + f, readable, err := openAnyFile(path, func(mode int) (*fd.FD, error) { return fd.OpenAt(parent.file, name, openFlags|mode, 0) }) - return f, path, err + return f, path, readable, err } // openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback // to O_PATH. 'path' is used for logging messages only. 'fn' is what does the // actual file open and is customizable by the caller. -func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) { +func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) { // Attempt to open file in the following mode in order: // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. // Use non-blocking to prevent getting stuck inside open(2) for // FIFOs. This option has no effect on regular files. // 2. PATH: for symlinks, sockets. - modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} + options := []struct { + mode int + readable bool + }{ + { + mode: syscall.O_RDONLY | syscall.O_NONBLOCK, + readable: true, + }, + { + mode: unix.O_PATH, + readable: false, + }, + } var err error - var file *fd.FD - for i, mode := range modes { - file, err = fn(mode) + for i, option := range options { + var file *fd.FD + file, err = fn(option.mode) if err == nil { - // openat succeeded, we're done. - break + // Succeeded opening the file, we're done. + return file, option.readable, nil } switch e := extractErrno(err); e { case syscall.ENOENT: // File doesn't exist, no point in retrying. - return nil, e + return nil, false, e } - // openat failed. Try again with next mode, preserving 'err' in case this - // was the last attempt. - log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err) - } - if err != nil { - // All attempts to open file have failed, return the last error. - log.Debugf("Failed to open file, path: %q, err: %v", path, err) - return nil, extractErrno(err) + // File failed to open. Try again with next mode, preserving 'err' in case + // this was the last attempt. + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, path, err) } - - return file, nil + // All attempts to open file have failed, return the last error. + log.Debugf("Failed to open file, path: %q, err: %v", path, err) + return nil, false, extractErrno(err) } func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) { @@ -316,18 +328,19 @@ func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, err return ft, nil } -func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) { +func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat syscall.Stat_t) (*localFile, error) { ft, err := getSupportedFileType(stat, a.conf.HostUDS) if err != nil { return nil, err } return &localFile{ - attachPoint: a, - hostPath: path, - file: file, - mode: invalidMode, - ft: ft, + attachPoint: a, + hostPath: path, + file: file, + mode: invalidMode, + ft: ft, + controlReadable: readable, }, nil } @@ -352,7 +365,7 @@ func newFDMaybe(file *fd.FD) *fd.FD { return dup } -func stat(fd int) (syscall.Stat_t, error) { +func fstat(fd int) (syscall.Stat_t, error) { var stat syscall.Stat_t if err := syscall.Fstat(fd, &stat); err != nil { return syscall.Stat_t{}, err @@ -360,6 +373,14 @@ func stat(fd int) (syscall.Stat_t, error) { return stat, nil } +func stat(path string) (syscall.Stat_t, error) { + var stat syscall.Stat_t + if err := syscall.Stat(path, &stat); err != nil { + return syscall.Stat_t{}, err + } + return stat, nil +} + func fchown(fd int, uid p9.UID, gid p9.GID) error { return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } @@ -372,7 +393,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // Check if control file can be used or if a new open must be created. var newFile *fd.FD - if flags == p9.ReadOnly { + if flags == p9.ReadOnly && l.controlReadable { log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath) newFile = l.file } else { @@ -388,7 +409,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { } } - stat, err := stat(newFile.FD()) + stat, err := fstat(newFile.FD()) if err != nil { if newFile != l.file { newFile.Close() @@ -449,7 +470,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid if err := fchown(child.FD(), uid, gid); err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } - stat, err := stat(child.FD()) + stat, err := fstat(child.FD()) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } @@ -497,7 +518,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -510,24 +531,25 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // Duplicate current file if 'names' is empty. if len(names) == 0 { - newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { + newFile, readable, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { return reopenProcFd(l.file, openFlags|mode) }) if err != nil { return nil, nil, extractErrno(err) } - stat, err := stat(newFile.FD()) + stat, err := fstat(newFile.FD()) if err != nil { newFile.Close() return nil, nil, extractErrno(err) } c := &localFile{ - attachPoint: l.attachPoint, - hostPath: l.hostPath, - file: newFile, - mode: invalidMode, + attachPoint: l.attachPoint, + hostPath: l.hostPath, + file: newFile, + mode: invalidMode, + controlReadable: readable, } return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil } @@ -535,19 +557,19 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { var qids []p9.QID last := l for _, name := range names { - f, path, err := openAnyFileFromParent(last, name) + f, path, readable, err := openAnyFileFromParent(last, name) if last != l { last.Close() } if err != nil { return nil, nil, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { f.Close() return nil, nil, extractErrno(err) } - c, err := newLocalFile(last.attachPoint, f, path, stat) + c, err := newLocalFile(last.attachPoint, f, path, readable, stat) if err != nil { f.Close() return nil, nil, extractErrno(err) @@ -592,7 +614,7 @@ func (l *localFile) FSync() error { // GetAttr implements p9.File. func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - stat, err := stat(l.file.FD()) + stat, err := fstat(l.file.FD()) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } @@ -880,7 +902,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -907,13 +929,39 @@ func (l *localFile) Link(target p9.File, newName string) error { } // Mknod implements p9.File. -// -// Not implemented. -func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { +func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return p9.QID{}, syscall.EROFS + } + + hostPath := path.Join(l.hostPath, name) + + // Return EEXIST if the file already exists. + if _, err := stat(hostPath); err == nil { + return p9.QID{}, syscall.EEXIST + } + // From mknod(2) man page: // "EPERM: [...] if the filesystem containing pathname does not support // the type of node requested." - return p9.QID{}, syscall.EPERM + if mode.FileType() != p9.ModeRegular { + return p9.QID{}, syscall.EPERM + } + + // Allow Mknod to create regular files. + if err := syscall.Mknod(hostPath, uint32(mode), 0); err != nil { + return p9.QID{}, err + } + + stat, err := stat(hostPath) + if err != nil { + return p9.QID{}, extractErrno(err) + } + return l.attachPoint.makeQID(stat), nil } // UnlinkAt implements p9.File. diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 05af7e397..5b37e6aa1 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -26,6 +26,19 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/test/testutil" +) + +var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} + +var ( + allTypes = []fileType{regular, directory, symlink} + + // allConfs is set in init(). + allConfs []Config + + rwConfs = []Config{{ROMount: false}} + roConfs = []Config{{ROMount: true}} ) func init() { @@ -39,6 +52,13 @@ func init() { } } +func configTestName(config *Config) string { + if config.ROMount { + return "ROMount" + } + return "RWMount" +} + func assertPanic(t *testing.T, f func()) { defer func() { if r := recover(); r == nil { @@ -88,18 +108,6 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error { return nil } -var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} - -var ( - allTypes = []fileType{regular, directory, symlink} - - // allConfs is set in init() above. - allConfs []Config - - rwConfs = []Config{{ROMount: false}} - roConfs = []Config{{ROMount: true}} -) - type state struct { root *localFile file *localFile @@ -117,42 +125,46 @@ func runAll(t *testing.T, test func(*testing.T, state)) { func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) { for _, c := range confs { - t.Logf("Config: %+v", c) - for _, ft := range types { - t.Logf("File type: %v", ft) + name := fmt.Sprintf("%s/%v", configTestName(&c), ft) + t.Run(name, func(t *testing.T) { + path, name, err := setup(ft) + if err != nil { + t.Fatalf("%v", err) + } + defer os.RemoveAll(path) - path, name, err := setup(ft) - if err != nil { - t.Fatalf("%v", err) - } - defer os.RemoveAll(path) + a, err := NewAttachPoint(path, c) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + t.Fatalf("Attach failed, err: %v", err) + } - a, err := NewAttachPoint(path, c) - if err != nil { - t.Fatalf("NewAttachPoint failed: %v", err) - } - root, err := a.Attach() - if err != nil { - t.Fatalf("Attach failed, err: %v", err) - } + _, file, err := root.Walk([]string{name}) + if err != nil { + root.Close() + t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) + } - _, file, err := root.Walk([]string{name}) - if err != nil { + st := state{ + root: root.(*localFile), + file: file.(*localFile), + conf: c, + ft: ft, + } + test(t, st) + file.Close() root.Close() - t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) - } - - st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft} - test(t, st) - file.Close() - root.Close() + }) } } } func setup(ft fileType) (string, string, error) { - path, err := ioutil.TempDir("", "root-") + path, err := ioutil.TempDir(testutil.TmpDir(), "root-") if err != nil { return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err) } @@ -308,6 +320,32 @@ func TestUnopened(t *testing.T) { }) } +// TestOpenOPath is a regression test to ensure that a file that cannot be open +// for read is allowed to be open. This was happening because the control file +// was open with O_PATH, but Open() was not checking for it and allowing the +// control file to be reused. +func TestOpenOPath(t *testing.T) { + runCustom(t, []fileType{regular}, rwConfs, func(t *testing.T, s state) { + // Fist remove all permissions on the file. + if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil { + t.Fatalf("SetAttr(): %v", err) + } + // Then walk to the file again to open a new control file. + filename := filepath.Base(s.file.hostPath) + _, newFile, err := s.root.Walk([]string{filename}) + if err != nil { + t.Fatalf("root.Walk(%q): %v", filename, err) + } + + if newFile.(*localFile).controlReadable { + t.Fatalf("control file didn't open with O_PATH: %+v", newFile) + } + if _, _, _, err := newFile.Open(p9.ReadOnly); err != syscall.EACCES { + t.Fatalf("Open() should have failed, got: %v, wanted: EACCES", err) + } + }) +} + func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) { if err := l.SetAttr(valid, attr); err != nil { return p9.Attr{}, err diff --git a/runsc/main.go b/runsc/main.go index c9f47c579..69cb505fa 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -88,6 +88,7 @@ var ( referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.") cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") + fuseEnabled = flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") @@ -242,6 +243,7 @@ func main() { OverlayfsStaleRead: *overlayfsStaleRead, CPUNumFromQuota: *cpuNumFromQuota, VFS2: *vfs2Enabled, + FUSE: *fuseEnabled, QDisc: queueingDiscipline, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, TestOnlyTestNameEnv: *testOnlyTestNameEnv, diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 035dcd3e3..2b9d4549d 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -29,7 +29,7 @@ go_library( "//runsc/console", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index deee619f3..817a923ad 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -134,7 +134,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG return err } if isRoot { - return fmt.Errorf("cannot run with network enabled in root network namespace") } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 6e1a2af25..2afcc27af 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -478,9 +478,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF // If the console control socket file is provided, then create a new // pty master/slave pair and set the TTY on the sandbox process. - if args.ConsoleSocket != "" { - cmd.Args = append(cmd.Args, "--console=true") - + if args.Spec.Process.Terminal && args.ConsoleSocket != "" { // console.NewWithSocket will send the master on the given // socket, and return the slave. tty, err := console.NewWithSocket(args.ConsoleSocket) diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 62d4f5113..43851a22f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -18,7 +18,7 @@ go_library( "//pkg/sentry/kernel/auth", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_mohae_deepcopy//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], @@ -29,5 +29,5 @@ go_test( size = "small", srcs = ["specutils_test.go"], library = ":specutils", - deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"], + deps = ["@com_github_opencontainers_runtime_spec//specs-go:go_default_library"], ) |