diff options
Diffstat (limited to 'runsc')
31 files changed, 881 insertions, 627 deletions
diff --git a/runsc/BUILD b/runsc/BUILD index 757f6d44c..96f697a5f 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -62,18 +62,22 @@ go_binary( ) pkg_tar( - name = "runsc-bin", - srcs = [":runsc"], + name = "debian-bin", + srcs = [ + ":runsc", + "//shim/v1:gvisor-containerd-shim", + "//shim/v2:containerd-shim-runsc-v1", + ], mode = "0755", package_dir = "/usr/bin", - strip_prefix = "/runsc/linux_amd64_pure_stripped", ) pkg_tar( name = "debian-data", extension = "tar.gz", deps = [ - ":runsc-bin", + ":debian-bin", + "//shim:config", ], ) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 2b1e6b13e..9f52438c2 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -40,6 +40,8 @@ go_library( "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", "//pkg/sentry/devices/memdev", + "//pkg/sentry/devices/ttydev", + "//pkg/sentry/devices/tundev", "//pkg/sentry/fdimport", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", @@ -53,6 +55,7 @@ go_library( "//pkg/sentry/fs/user", "//pkg/sentry/fsimpl/devpts", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/fuse", "//pkg/sentry/fsimpl/gofer", "//pkg/sentry/fsimpl/host", "//pkg/sentry/fsimpl/overlay", @@ -87,6 +90,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/packetsocket", "//pkg/tcpip/link/qdisc/fifo", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", @@ -103,7 +107,7 @@ go_library( "//runsc/boot/pprof", "//runsc/specutils", "@com_github_golang_protobuf//proto:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -128,7 +132,7 @@ go_test( "//pkg/sync", "//pkg/unet", "//runsc/fsgofer", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/boot/config.go b/runsc/boot/config.go index bcec7e4db..80da8b3e6 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -187,6 +187,12 @@ type Config struct { // SoftwareGSO indicates that software segmentation offload is enabled. SoftwareGSO bool + // TXChecksumOffload indicates that TX Checksum Offload is enabled. + TXChecksumOffload bool + + // RXChecksumOffload indicates that RX Checksum Offload is enabled. + RXChecksumOffload bool + // QDisc indicates the type of queuening discipline to use by default // for non-loopback interfaces. QDisc QueueingDiscipline @@ -268,6 +274,9 @@ type Config struct { // Enables VFS2 (not plumbled through yet). VFS2 bool + + // Enables FUSE usage (not plumbled through yet). + FUSE bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -299,6 +308,8 @@ func (c *Config) ToFlags() []string { "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), "--gso=" + strconv.FormatBool(c.HardwareGSO), "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), + "--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload), + "--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload), "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), "--qdisc=" + c.QDisc.String(), } @@ -317,5 +328,9 @@ func (c *Config) ToFlags() []string { f = append(f, "--vfs2=true") } + if c.FUSE { + f = append(f, "--fuse=true") + } + return f } diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 8125d5061..3e5e4c22f 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -155,7 +155,7 @@ func newController(fd int, l *Loader) (*controller, error) { srv.Register(&debug{}) srv.Register(&control.Logging{}) - if l.conf.ProfileEnable { + if l.root.conf.ProfileEnable { srv.Register(&control.Profile{ Kernel: l.k, }) @@ -333,7 +333,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Pause the kernel while we build a new one. cm.l.k.Pause() - p, err := createPlatform(cm.l.conf, deviceFile) + p, err := createPlatform(cm.l.root.conf, deviceFile) if err != nil { return fmt.Errorf("creating platform: %v", err) } @@ -349,8 +349,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. - mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints) - renv, err := mntr.createRestoreEnvironment(cm.l.conf) + mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints) + renv, err := mntr.createRestoreEnvironment(cm.l.root.conf) if err != nil { return fmt.Errorf("creating RestoreEnvironment: %v", err) } @@ -368,7 +368,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("file cannot be empty") } - if cm.l.conf.ProfileEnable { + if cm.l.root.conf.ProfileEnable { // pprof.Initialize opens /proc/self/maps, so has to be called before // installing seccomp filters. pprof.Initialize() @@ -387,13 +387,13 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Since we have a new kernel we also must make a new watchdog. dogOpts := watchdog.DefaultOpts - dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction + dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction dog := watchdog.New(k, dogOpts) // Change the loader fields to reflect the changes made when restoring. cm.l.k = k cm.l.watchdog = dog - cm.l.rootProcArgs = kernel.CreateProcessArgs{} + cm.l.root.procArgs = kernel.CreateProcessArgs{} cm.l.restore = true // Reinitialize the sandbox ID and processes map. Note that it doesn't diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 60e33425f..149eb0b1b 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -310,19 +310,12 @@ var allowedSyscalls = seccomp.SyscallRules{ }, }, syscall.SYS_WRITE: {}, - // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with - // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR - // option is enabled for a packet socket. + // For rawfile.NonBlockingWriteIovec. syscall.SYS_WRITEV: []seccomp.Rule{ { seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(2), - }, - { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(3), + seccomp.GreaterThan(0), }, }, } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e83584b82..59639ba19 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -29,6 +29,7 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" + "gvisor.dev/gvisor/pkg/sentry/vfs" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" @@ -390,6 +391,10 @@ type mountHint struct { // root is the inode where the volume is mounted. For mounts with 'pod' share // the volume is mounted once and then bind mounted inside the containers. root *fs.Inode + + // vfsMount is the master mount for the volume. For mounts with 'pod' share + // the master volume is bind mounted inside the containers. + vfsMount *vfs.Mount } func (m *mountHint) setField(key, val string) error { @@ -571,9 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin // processHints processes annotations that container hints about how volumes // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. -func (c *containerMounter) processHints(conf *Config) error { +func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error { if conf.VFS2 { - return nil + return c.processHintsVFS2(conf, creds) } ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index b05a8bd45..9cd9c5909 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -77,29 +77,34 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) -// Loader keeps state needed to start the kernel and run the container.. -type Loader struct { - // k is the kernel. - k *kernel.Kernel - - // ctrl is the control server. - ctrl *controller - +type containerInfo struct { conf *Config - // console is set to true if terminal is enabled. - console bool + // spec is the base configuration for the root container. + spec *specs.Spec - watchdog *watchdog.Watchdog + // procArgs refers to the container's init task. + procArgs kernel.CreateProcessArgs // stdioFDs contains stdin, stdout, and stderr. stdioFDs []int // goferFDs are the FDs that attach the sandbox to the gofers. goferFDs []int +} - // spec is the base configuration for the root container. - spec *specs.Spec +// Loader keeps state needed to start the kernel and run the container.. +type Loader struct { + // k is the kernel. + k *kernel.Kernel + + // ctrl is the control server. + ctrl *controller + + // root contains information about the root container in the sandbox. + root containerInfo + + watchdog *watchdog.Watchdog // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. @@ -108,9 +113,6 @@ type Loader struct { // restore is set to true if we are restoring a container. restore bool - // rootProcArgs refers to the root sandbox init task. - rootProcArgs kernel.CreateProcessArgs - // sandboxID is the ID for the whole sandbox. sandboxID string @@ -175,8 +177,6 @@ type Args struct { // StdioFDs is the stdio for the application. The Loader takes ownership of // these FDs and may close them at any time. StdioFDs []int - // Console is set to true if using TTY. - Console bool // NumCPU is the number of CPUs to create inside the sandbox. NumCPU int // TotalMem is the initial amount of total memory to report back to the @@ -205,6 +205,10 @@ func New(args Args) (*Loader, error) { // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true + if args.Conf.FUSE { + kernel.FUSEEnabled = true + } + vfs2.Override() } @@ -227,9 +231,7 @@ func New(args Args) (*Loader, error) { // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. - // - // FIXME(b/109889800): Use non-nil context. - vdso, err := loader.PrepareVDSO(nil, k) + vdso, err := loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("creating vdso: %v", err) } @@ -300,6 +302,12 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } + if kernel.VFS2Enabled { + if err := registerFilesystems(k); err != nil { + return nil, fmt.Errorf("registering filesystems: %w", err) + } + } + if err := adjustDirentCache(k); err != nil { return nil, err } @@ -318,7 +326,7 @@ func New(args Args) (*Loader, error) { dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction dog := watchdog.New(k, dogOpts) - procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) + procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) if err != nil { return nil, fmt.Errorf("creating init process for root container: %v", err) } @@ -366,17 +374,18 @@ func New(args Args) (*Loader, error) { eid := execID{cid: args.ID} l := &Loader{ - k: k, - conf: args.Conf, - console: args.Console, - watchdog: dog, - spec: args.Spec, - goferFDs: args.GoferFDs, - stdioFDs: stdioFDs, - rootProcArgs: procArgs, - sandboxID: args.ID, - processes: map[execID]*execProcess{eid: {}}, - mountHints: mountHints, + k: k, + watchdog: dog, + sandboxID: args.ID, + processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, + root: containerInfo{ + conf: args.Conf, + stdioFDs: stdioFDs, + goferFDs: args.GoferFDs, + spec: args.Spec, + procArgs: procArgs, + }, } // We don't care about child signals; some platforms can generate a @@ -404,8 +413,8 @@ func New(args Args) (*Loader, error) { return l, nil } -// newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { +// createProcessArgs creates args that can be used with kernel.CreateProcess. +func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -479,13 +488,13 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { } func (l *Loader) installSeccompFilters() error { - if l.conf.DisableSeccomp { + if l.root.conf.DisableSeccomp { filter.Report("syscall filter is DISABLED. Running in less secure mode.") } else { opts := filter.Options{ Platform: l.k.Platform, - HostNetwork: l.conf.Network == NetworkHost, - ProfileEnable: l.conf.ProfileEnable, + HostNetwork: l.root.conf.Network == NetworkHost, + ProfileEnable: l.root.conf.ProfileEnable, ControllerFD: l.ctrl.srv.FD(), } if err := filter.Install(opts); err != nil { @@ -511,7 +520,7 @@ func (l *Loader) Run() error { } func (l *Loader) run() error { - if l.conf.Network == NetworkHost { + if l.root.conf.Network == NetworkHost { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. log.Debugf("Configuring host network") @@ -532,10 +541,8 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. - var ttyFile *host.TTYFileOperations - var ttyFileVFS2 *hostvfs2.TTYFileDescription if !l.restore { - if l.conf.ProfileEnable { + if l.root.conf.ProfileEnable { pprof.Initialize() } @@ -545,82 +552,29 @@ func (l *Loader) run() error { return err } - // Create the FD map, which will set stdin, stdout, and stderr. If console - // is true, then ioctl calls will be passed through to the host fd. - ctx := l.rootProcArgs.NewContext(l.k) - var err error - - // CreateProcess takes a reference on FDMap if successful. We won't need - // ours either way. - l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs) - if err != nil { - return fmt.Errorf("importing fds: %v", err) - } - - // Setup the root container file system. - l.startGoferMonitor(l.sandboxID, l.goferFDs) - - mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { - return err - } - if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil { - return err - } - - // Add the HOME enviroment variable if it is not already set. - var envv []string - if kernel.VFS2Enabled { - envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, - l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) - - } else { - envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, - l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) - } - if err != nil { - return err - } - l.rootProcArgs.Envv = envv - // Create the root container init task. It will begin running // when the kernel is started. - if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { - return fmt.Errorf("creating init process: %v", err) + if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { + return err } - - // CreateProcess takes a reference on FDTable if successful. - l.rootProcArgs.FDTable.DecRef() } ep.tg = l.k.GlobalInit() - if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok { + if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok { ep.pidnsPath = ns.Path } - if l.console { - // Set the foreground process group on the TTY to the global init process - // group, since that is what we are about to start running. - switch { - case ttyFileVFS2 != nil: - ep.ttyVFS2 = ttyFileVFS2 - ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup()) - case ttyFile != nil: - ep.tty = ttyFile - ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup()) - } - } // Handle signals by forwarding them to the root container process // (except for panic signal, which should cause a panic). l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { // Panic signal should cause a panic. - if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) { panic("Signal-induced panic") } // Otherwise forward to root container. deliveryMode := DeliverToProcess - if l.console { + if l.root.spec.Process.Terminal { // Since we are running with a console, we should forward the signal to // the foreground process group so that job control signals like ^C can // be handled properly. @@ -637,7 +591,7 @@ func (l *Loader) run() error { // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the // passed FDs, so only close for VFS1. if !kernel.VFS2Enabled { - for _, fd := range l.stdioFDs { + for _, fd := range l.root.stdioFDs { err := syscall.Close(fd) if err != nil { return fmt.Errorf("close dup()ed stdioFDs: %v", err) @@ -676,8 +630,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file l.mu.Lock() defer l.mu.Unlock() - eid := execID{cid: cid} - if _, ok := l.processes[eid]; !ok { + ep := l.processes[execID{cid: cid}] + if ep == nil { return fmt.Errorf("trying to start a deleted container %q", cid) } @@ -711,76 +665,112 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file if pidns == nil { pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) } - l.processes[eid].pidnsPath = ns.Path + ep.pidnsPath = ns.Path } else { pidns = l.k.RootPIDNamespace() } - procArgs, err := newProcess(cid, spec, creds, l.k, pidns) + + info := &containerInfo{ + conf: conf, + spec: spec, + } + info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } // setupContainerFS() dups stdioFDs, so we don't need to dup them here. - var stdioFDs []int for _, f := range files[:3] { - stdioFDs = append(stdioFDs, int(f.Fd())) + info.stdioFDs = append(info.stdioFDs, int(f.Fd())) } - // Create the FD map, which will set stdin, stdout, and stderr. - ctx := procArgs.NewContext(l.k) - fdTable, _, _, err := createFDTable(ctx, false, stdioFDs) - if err != nil { - return fmt.Errorf("importing fds: %v", err) - } - // CreateProcess takes a reference on fdTable if successful. We won't - // need ours either way. - procArgs.FDTable = fdTable - // Can't take ownership away from os.File. dup them to get a new FDs. - var goferFDs []int for _, f := range files[3:] { fd, err := syscall.Dup(int(f.Fd())) if err != nil { return fmt.Errorf("failed to dup file: %v", err) } - goferFDs = append(goferFDs, fd) + info.goferFDs = append(info.goferFDs, fd) + } + + tg, err := l.createContainerProcess(false, cid, info, ep) + if err != nil { + return err + } + + // Success! + l.k.StartProcess(tg) + ep.tg = tg + return nil +} + +func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) { + console := false + if root { + // Only root container supports terminal for now. + console = info.spec.Process.Terminal } + // Create the FD map, which will set stdin, stdout, and stderr. + ctx := info.procArgs.NewContext(l.k) + fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs) + if err != nil { + return nil, fmt.Errorf("importing fds: %v", err) + } + // CreateProcess takes a reference on fdTable if successful. We won't need + // ours either way. + info.procArgs.FDTable = fdTable + // Setup the child container file system. - l.startGoferMonitor(cid, goferFDs) + l.startGoferMonitor(cid, info.goferFDs) - mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints) - if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil { - return err + mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints) + if root { + if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { + return nil, err + } + } + if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil { + return nil, err } // Add the HOME enviroment variable if it is not already set. var envv []string if kernel.VFS2Enabled { - envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2, - procArgs.Credentials.RealKUID, procArgs.Envv) + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2, + info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } else { - envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace, - procArgs.Credentials.RealKUID, procArgs.Envv) + envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace, + info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } if err != nil { - return err + return nil, err } - procArgs.Envv = envv + info.procArgs.Envv = envv // Create and start the new process. - tg, _, err := l.k.CreateProcess(procArgs) + tg, _, err := l.k.CreateProcess(info.procArgs) if err != nil { - return fmt.Errorf("creating process: %v", err) + return nil, fmt.Errorf("creating process: %v", err) } - l.k.StartProcess(tg) - // CreateProcess takes a reference on FDTable if successful. - procArgs.FDTable.DecRef() + info.procArgs.FDTable.DecRef() - l.processes[eid].tg = tg - return nil + // Set the foreground process group on the TTY to the global init process + // group, since that is what we are about to start running. + if root { + switch { + case ttyFileVFS2 != nil: + ep.ttyVFS2 = ttyFileVFS2 + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFile != nil: + ep.tty = ttyFile + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) + } + } + + return tg, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on @@ -1059,7 +1049,7 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in // Enable SACK Recovery. if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { - return nil, fmt.Errorf("failed to enable SACK: %v", err) + return nil, fmt.Errorf("failed to enable SACK: %s", err) } // Set default TTLs as required by socket/netstack. @@ -1068,11 +1058,9 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in // Enable Receive Buffer Auto-Tuning. if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) + return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err) } - s.FillIPTablesMetadata() - return &s, nil } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index e448fd773..8e6fe57e1 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -479,13 +479,13 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { defer l.Destroy() defer loaderCleanup() - mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { + mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil { t.Fatalf("failed process hints: %v", err) } ctx := l.k.SupervisorContext() - mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) + mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs) if err != nil { t.Fatalf("failed to setupVFS2: %v", err) } @@ -499,7 +499,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { Path: fspath.Parse(p), } - if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { + if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { d.DecRef() diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 0af30456e..4e1fa7665 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" @@ -123,6 +124,8 @@ type FDBasedLink struct { Routes []Route GSOMaxSize uint32 SoftwareGSOEnabled bool + TXChecksumOffload bool + RXChecksumOffload bool LinkAddress net.HardwareAddr QDisc QueueingDiscipline @@ -236,7 +239,8 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct PacketDispatchMode: fdbased.RecvMMsg, GSOMaxSize: link.GSOMaxSize, SoftwareGSOEnabled: link.SoftwareGSOEnabled, - RXChecksumOffload: true, + TXChecksumOffload: link.TXChecksumOffload, + RXChecksumOffload: link.RXChecksumOffload, }) if err != nil { return err @@ -249,6 +253,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } + // Enable support for AF_PACKET sockets to receive outgoing packets. + linkEP = packetsocket.New(linkEP) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index d1653b279..9a1ed8e9e 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -26,9 +26,12 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/devices/memdev" + "gvisor.dev/gvisor/pkg/sentry/devices/ttydev" + "gvisor.dev/gvisor/pkg/sentry/devices/tundev" "gvisor.dev/gvisor/pkg/sentry/fs/user" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay" "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" @@ -40,7 +43,11 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { +func registerFilesystems(k *kernel.Kernel) error { + ctx := k.SupervisorContext() + creds := auth.NewRootCredentials(k.RootUserNamespace()) + vfsObj := k.VFS() + vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, // TODO(b/29356795): Users may mount this once the terminals are in a @@ -70,11 +77,28 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre AllowUserMount: true, AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) // Setup files in devtmpfs. if err := memdev.Register(vfsObj); err != nil { return fmt.Errorf("registering memdev: %w", err) } + if err := ttydev.Register(vfsObj); err != nil { + return fmt.Errorf("registering ttydev: %w", err) + } + + if kernel.FUSEEnabled { + if err := fuse.Register(vfsObj); err != nil { + return fmt.Errorf("registering fusedev: %w", err) + } + } + + if err := tundev.Register(vfsObj); err != nil { + return fmt.Errorf("registering tundev: %v", err) + } a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name) if err != nil { return fmt.Errorf("creating devtmpfs accessor: %w", err) @@ -85,15 +109,25 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre return fmt.Errorf("initializing userspace: %w", err) } if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil { - return fmt.Errorf("creating devtmpfs files: %w", err) + return fmt.Errorf("creating memdev devtmpfs files: %w", err) + } + if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating ttydev devtmpfs files: %w", err) + } + if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating tundev devtmpfs files: %v", err) } + + if kernel.FUSEEnabled { + if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { + return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + } + } + return nil } func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { - if err := mntr.k.VFS().Init(); err != nil { - return fmt.Errorf("failed to initialize VFS: %w", err) - } mns, err := mntr.setupVFS2(ctx, conf, procArgs) if err != nil { return fmt.Errorf("failed to setupFS: %w", err) @@ -122,10 +156,6 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals rootCtx := procArgs.NewContext(c.k) - if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil { - return nil, fmt.Errorf("register filesystems: %w", err) - } - mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) if err != nil { return nil, fmt.Errorf("creating mount namespace: %w", err) @@ -160,8 +190,14 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, for i := range mounts { submount := &mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) - if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { - return err + if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) + } + } else { + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { + return fmt.Errorf("mount submount %q: %w", submount.Destination, err) + } } } @@ -235,20 +271,18 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) { - var ( - fsName string - data []string - ) + fsName := m.Type + var data []string // Find filesystem name and FS specific data field. switch m.Type { case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: - fsName = m.Type + // Nothing to do. + case nonefs: fsName = sys.Name - case tmpfs.Name: - fsName = m.Type + case tmpfs.Name: var err error data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { @@ -257,10 +291,16 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF case bind: fsName = gofer.Name + if m.fd == 0 { + // Check that an FD was provided to fails fast. Technically FD=0 is valid, + // but unlikely to be correct in this context. + return "", nil, fmt.Errorf("9P mount requires a connection FD") + } data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) default: log.Warningf("ignoring unknown filesystem type %q", m.Type) + return "", nil, nil } opts := &vfs.MountOptions{ @@ -300,7 +340,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s } _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) if err == nil { - // Mount point exists, nothing else to do. + log.Debugf("Mount point %q already exists", currentPath) return nil } if err != syserror.ENOENT { @@ -378,3 +418,76 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds return fmt.Errorf(`stating "/tmp" inside container: %w`, err) } } + +// processHintsVFS2 processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error { + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs.Name { + continue + } + + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.vfsMount = mnt + } + return nil +} + +// mountSharedMasterVFS2 mounts the master of a volume that is shared among +// containers in a pod. +func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + mntFD := &mountAndFD{Mount: hint.mount} + fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error { + if err := source.checkCompatible(mount); err != nil { + return err + } + + _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + if err != nil { + return err + } + newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts) + if err != nil { + return err + } + defer newMnt.DecRef() + + root := mns.Root() + defer root.DecRef() + if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil { + return err + } + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(mount.Destination), + } + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { + return err + } + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return nil +} diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index 7e34a284a..37f4253ba 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -10,7 +10,7 @@ go_library( "//pkg/cleanup", "//pkg/log", "@com_github_cenkalti_backoff//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) @@ -22,6 +22,6 @@ go_test( tags = ["local"], deps = [ "//pkg/test/testutil", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index af3538ef0..1b5178dd5 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -45,7 +45,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/platform", - "//pkg/state", + "//pkg/state/pretty", "//pkg/state/statefile", "//pkg/sync", "//pkg/unet", @@ -58,7 +58,7 @@ go_library( "//runsc/fsgofer/filter", "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], @@ -87,9 +87,9 @@ go_test( "//runsc/boot", "//runsc/container", "//runsc/specutils", - "@com_github_google_go-cmp//cmp:go_default_library", - "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 01204ab4d..f4f247721 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -54,10 +54,6 @@ type Boot struct { // provided in that order. stdioFDs intFlags - // console is set to true if the sandbox should allow terminal ioctl(2) - // syscalls. - console bool - // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool @@ -115,7 +111,6 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") - f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") @@ -229,7 +224,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Device: os.NewFile(uintptr(b.deviceFD), "platform device"), GoferFDs: b.ioFDs.GetArray(), StdioFDs: b.stdioFDs.GetArray(), - Console: b.console, NumCPU: b.cpuNum, TotalMem: b.totalMem, UserLogFD: b.userLogFD, diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index a2b0a4b14..55194e641 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -16,124 +16,122 @@ package cmd import ( "context" - "fmt" - "io/ioutil" + "encoding/json" + "io" "os" "path/filepath" "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/runsc/flag" ) -func genSpec(cwd string) []byte { - var template = fmt.Sprintf(`{ - "ociVersion": "1.0.0", - "process": { - "terminal": true, - "user": { - "uid": 0, - "gid": 0 - }, - "args": [ - "sh" - ], - "env": [ - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm" - ], - "cwd": "%s", - "capabilities": { - "bounding": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "effective": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "inheritable": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "permitted": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "ambient": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ] - }, - "rlimits": [ - { - "type": "RLIMIT_NOFILE", - "hard": 1024, - "soft": 1024 - } - ] - }, - "root": { - "path": "rootfs", - "readonly": true - }, - "hostname": "runsc", - "mounts": [ - { - "destination": "/proc", - "type": "proc", - "source": "proc" +func writeSpec(w io.Writer, cwd string, netns string, args []string) error { + spec := &specs.Spec{ + Version: "1.0.0", + Process: &specs.Process{ + Terminal: true, + User: specs.User{ + UID: 0, + GID: 0, + }, + Args: args, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: cwd, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + // TODO(gvisor.dev/issue/3166): support ambient capabilities + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: 1024, + Soft: 1024, + }, + }, }, - { - "destination": "/dev", - "type": "tmpfs", - "source": "tmpfs", - "options": [] + Root: &specs.Root{ + Path: "rootfs", + Readonly: true, }, - { - "destination": "/sys", - "type": "sysfs", - "source": "sysfs", - "options": [ - "nosuid", - "noexec", - "nodev", - "ro" - ] - } - ], - "linux": { - "namespaces": [ + Hostname: "runsc", + Mounts: []specs.Mount{ { - "type": "pid" + Destination: "/proc", + Type: "proc", + Source: "proc", }, { - "type": "network" + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", }, { - "type": "ipc" + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{ + "nosuid", + "noexec", + "nodev", + "ro", + }, }, - { - "type": "uts" + }, + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + Path: netns, + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, }, - { - "type": "mount" - } - ] + }, } -}`, cwd) - return []byte(template) + e := json.NewEncoder(w) + e.SetIndent("", " ") + return e.Encode(spec) } // Spec implements subcommands.Command for the "spec" command. type Spec struct { bundle string cwd string + netns string } // Name implements subcommands.Command.Name. @@ -148,21 +146,26 @@ func (*Spec) Synopsis() string { // Usage implements subcommands.Command.Usage. func (*Spec) Usage() string { - return `spec [options] - create a new OCI bundle specification file. + return `spec [options] [-- args...] - create a new OCI bundle specification file. + +The spec command creates a new specification file (config.json) for a new OCI +bundle. -The spec command creates a new specification file (config.json) for a new OCI bundle. +The specification file is a starter file that runs the command specified by +'args' in the container. If 'args' is not specified the default is to run the +'sh' program. -The specification file is a starter file that runs the "sh" command in the container. You -should edit the file to suit your needs. You can find out more about the format of the -specification file by visiting the OCI runtime spec repository: +While a number of flags are provided to change values in the specification, you +can examine the file and edit it to suit your needs after this command runs. +You can find out more about the format of the specification file by visiting +the OCI runtime spec repository: https://github.com/opencontainers/runtime-spec/ EXAMPLE: $ mkdir -p bundle/rootfs $ cd bundle - $ runsc spec + $ runsc spec -- /hello $ docker export $(docker create hello-world) | tar -xf - -C rootfs - $ sed -i 's;"sh";"/hello";' config.json $ sudo runsc run hello ` @@ -173,18 +176,29 @@ func (s *Spec) SetFlags(f *flag.FlagSet) { f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle") f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+ "this value MUST be an absolute path") + f.StringVar(&s.netns, "netns", "", "network namespace path") } // Execute implements subcommands.Command.Execute. func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + // Grab the arguments. + containerArgs := f.Args() + if len(containerArgs) == 0 { + containerArgs = []string{"sh"} + } + confPath := filepath.Join(s.bundle, "config.json") if _, err := os.Stat(confPath); !os.IsNotExist(err) { Fatalf("file %q already exists", confPath) } - var spec = genSpec(s.cwd) + configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664) + if err != nil { + Fatalf("opening file %q: %v", confPath, err) + } - if err := ioutil.WriteFile(confPath, spec, 0664); err != nil { + err = writeSpec(configFile, s.cwd, s.netns, containerArgs) + if err != nil { Fatalf("writing to %q: %v", confPath, err) } diff --git a/runsc/cmd/statefile.go b/runsc/cmd/statefile.go index e6f1907da..daed9e728 100644 --- a/runsc/cmd/statefile.go +++ b/runsc/cmd/statefile.go @@ -20,7 +20,7 @@ import ( "os" "github.com/google/subcommands" - "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/pretty" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/runsc/flag" ) @@ -105,8 +105,14 @@ func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...interfac if err != nil { Fatalf("error parsing statefile: %v", err) } - if err := state.PrettyPrint(output, rc, s.html); err != nil { - Fatalf("error printing state: %v", err) + if s.html { + if err := pretty.PrintHTML(output, rc); err != nil { + Fatalf("error printing state: %v", err) + } + } else { + if err := pretty.PrintText(output, rc); err != nil { + Fatalf("error printing state: %v", err) + } } return subcommands.ExitSuccess } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 49cfb0837..9a9ee7e2a 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -27,7 +27,7 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_gofrs_flock//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) @@ -68,7 +68,7 @@ go_test( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_kr_pty//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 3813c6b93..995d4e267 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -122,6 +122,7 @@ func TestConsoleSocket(t *testing.T) { for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { spec := testutil.NewSpecWithArgs("true") + spec.Process.Terminal = true _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) diff --git a/runsc/container/container.go b/runsc/container/container.go index 6d297d0df..7ad09bf23 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -324,7 +324,7 @@ func New(conf *boot.Config, args Args) (*Container, error) { } } if err := runInCgroup(cg, func() error { - ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir) + ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached) if err != nil { return err } @@ -427,7 +427,7 @@ func (c *Container) Start(conf *boot.Config) error { // the start (and all their children processes). if err := runInCgroup(c.Sandbox.Cgroup, func() error { // Create the gofer process. - ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) if err != nil { return err } @@ -861,7 +861,7 @@ func (c *Container) waitForStopped() error { return backoff.Retry(op, b) } -func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) { +func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) { // Start with the general config flags. args := conf.ToFlags() @@ -955,6 +955,14 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund cmd.ExtraFiles = goferEnds cmd.Args[0] = "runsc-gofer" + if attached { + // The gofer is attached to the lifetime of this process, so it + // should synchronously die when this process dies. + cmd.SysProcAttr = &syscall.SysProcAttr{ + Pdeathsig: syscall.SIGKILL, + } + } + // Enter new namespaces to isolate from the rest of the system. Don't unshare // cgroup because gofer is added to a cgroup in the caller's namespace. nss := []specs.LinuxNamespace{ diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index cd76645bd..5e8247bc8 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -643,7 +643,9 @@ func TestExec(t *testing.T) { if err != nil { t.Fatalf("error creating temporary directory: %v", err) } - cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100", dir) + // Note that some shells may exec the final command in a sequence as + // an optimization. We avoid this here by adding the exit 0. + cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100 && exit 0", dir) spec := testutil.NewSpecWithArgs("sh", "-c", cmd) _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index a27a01942..e189648f4 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -100,19 +100,20 @@ type execDesc struct { c *Container cmd []string want int - desc string + name string } -func execMany(execs []execDesc) error { +func execMany(t *testing.T, execs []execDesc) { for _, exec := range execs { - args := &control.ExecArgs{Argv: exec.cmd} - if ws, err := exec.c.executeSync(args); err != nil { - return fmt.Errorf("error executing %+v: %v", args, err) - } else if ws.ExitStatus() != exec.want { - return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want) - } + t.Run(exec.name, func(t *testing.T) { + args := &control.ExecArgs{Argv: exec.cmd} + if ws, err := exec.c.executeSync(args); err != nil { + t.Errorf("error executing %+v: %v", args, err) + } else if ws.ExitStatus() != exec.want { + t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want) + } + }) } - return nil } func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) { @@ -1072,7 +1073,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) { // Test that pod shared mounts are properly mounted in 2 containers and that // changes from one container is reflected in the other. func TestMultiContainerSharedMount(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1110,84 +1111,82 @@ func TestMultiContainerSharedMount(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", + name: "directory is mounted in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", + name: "directory is mounted in container1", }, { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", + name: "file appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", + name: "file appears in container1", }, { c: containers[1], cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", + name: "remove file from container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", + name: "file removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, { c: containers[1], cmd: []string{"/bin/mkdir", file1}, - desc: "create directory in container1", + name: "create directory in container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-d", file0}, - desc: "dir appears in container0", + name: "dir appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", file1}, - desc: "dir appears in container1", + name: "dir appears in container1", }, { c: containers[0], cmd: []string{"/bin/rmdir", file0}, - desc: "create directory in container0", + name: "remove directory from container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-d", file0}, - desc: "dir removed from container0", + name: "dir removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-d", file1}, - desc: "dir removed from container1", + name: "dir removed from container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } // Test that pod mounts are mounted as readonly when requested. func TestMultiContainerSharedMountReadonly(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1225,35 +1224,34 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", + name: "directory is mounted in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", + name: "directory is mounted in container1", }, { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, + cmd: []string{"/bin/touch", file0}, want: 1, - desc: "fails to write to container0", + name: "fails to write to container0", }, { c: containers[1], - cmd: []string{"/usr/bin/touch", file1}, + cmd: []string{"/bin/touch", file1}, want: 1, - desc: "fails to write to container1", + name: "fails to write to container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } // Test that shared pod mounts continue to work after container is restarted. func TestMultiContainerSharedMountRestart(t *testing.T) { + //TODO(gvisor.dev/issue/1487): This is failing with VFS2. for name, conf := range configs(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() @@ -1291,23 +1289,21 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { execs := []execDesc{ { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", + name: "file appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", + name: "file appears in container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) containers[1].Destroy() @@ -1334,32 +1330,30 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file is still in container0", + name: "file is still in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file is still in container1", + name: "file is still in container1", }, { c: containers[1], cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", + name: "file removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } @@ -1367,53 +1361,53 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { // Test that unsupported pod mounts options are ignored when matching master and // slave mounts. func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { - rootDir, cleanup, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer cleanup() - - conf := testutil.TestConfig(t) - conf.RootDir = rootDir + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir - // Setup the containers. - sleep := []string{"/bin/sleep", "100"} - podSpec, ids := createSpecs(sleep, sleep) - mnt0 := specs.Mount{ - Destination: "/mydir/test", - Source: "/some/dir", - Type: "tmpfs", - Options: []string{"rw", "rbind", "relatime"}, - } - podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + // Setup the containers. + sleep := []string{"/bin/sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: []string{"rw", "rbind", "relatime"}, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) - mnt1 := mnt0 - mnt1.Destination = "/mydir2/test2" - mnt1.Options = []string{"rw", "nosuid"} - podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + mnt1.Options = []string{"rw", "nosuid"} + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) - createSharedMount(mnt0, "test-mount", podSpec...) + createSharedMount(mnt0, "test-mount", podSpec...) - containers, cleanup, err := startContainers(conf, podSpec, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - execs := []execDesc{ - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", - }, - } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + name: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + name: "directory is mounted in container1", + }, + } + execMany(t, execs) + }) } } diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh index dc7aeee87..d1e28e17b 100755 --- a/runsc/debian/postinst.sh +++ b/runsc/debian/postinst.sh @@ -18,7 +18,14 @@ if [ "$1" != configure ]; then exit 0 fi +# Update docker configuration. if [ -f /etc/docker/daemon.json ]; then runsc install - systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2 + if systemctl status docker 2>/dev/null; then + systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2 + fi fi + +# For containerd-based installers, we don't automatically update the +# configuration. If it uses a v2 shim, then it will find the package binaries +# automatically when provided the appropriate annotation. diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 1036b0630..05e3637f7 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -31,5 +31,6 @@ go_test( deps = [ "//pkg/log", "//pkg/p9", + "//pkg/test/testutil", ], ) diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 1dce36965..88814b83c 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -128,6 +128,7 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_MADVISE: {}, unix.SYS_MEMFD_CREATE: {}, /// Used by flipcall.PacketWindowAllocator.Init(). syscall.SYS_MKDIRAT: {}, + syscall.SYS_MKNODAT: {}, // Used by the Go runtime as a temporarily workaround for a Linux // 5.2-5.4 bug. // diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index edc239013..ebefeacf2 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -48,36 +48,6 @@ const ( openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC ) -type fileType int - -const ( - regular fileType = iota - directory - symlink - socket - unknown -) - -// String implements fmt.Stringer. -func (f fileType) String() string { - switch f { - case regular: - return "regular" - case directory: - return "directory" - case symlink: - return "symlink" - case socket: - return "socket" - } - return "unknown" -} - -// ControlSocketAddr generates an abstract unix socket name for the given id. -func ControlSocketAddr(id string) string { - return fmt.Sprintf("\x00runsc-gofer.%s", id) -} - // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. @@ -132,19 +102,19 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) } - f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { + f, readable, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { return fd.Open(a.prefix, openFlags|mode, 0) }) if err != nil { return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) } - lf, err := newLocalFile(a, f, a.prefix, stat) + lf, err := newLocalFile(a, f, a.prefix, readable, stat) if err != nil { return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err) } @@ -175,8 +145,6 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { log.Warningf("first 8 bytes of host inode id %x will be truncated to construct virtual inode id", stat.Ino) } ino := uint64(dev)<<56 | maskedIno - log.Debugf("host inode %x on device %x mapped to virtual inode %x", stat.Ino, stat.Dev, ino) - return p9.QID{ Type: p9.FileMode(stat.Mode).QIDType(), Path: ino, @@ -201,8 +169,6 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { // entire file up when it's opened in write mode, and would perform badly when // multiple files are only being opened for read (esp. startup). type localFile struct { - p9.DefaultWalkGetAttr - // attachPoint is the attachPoint that serves this localFile. attachPoint *attachPoint @@ -214,12 +180,19 @@ type localFile struct { // opened with. file *fd.FD + // controlReadable tells whether 'file' was opened with read permissions + // during a walk. + controlReadable bool + // mode is the mode in which the file was opened. Set to invalidMode // if localFile isn't opened. mode p9.OpenFlags - // ft is the fileType for this file. - ft fileType + // fileType for this file. It is equivalent to: + // syscall.Stat_t.Mode & syscall.S_IFMT + fileType uint32 + + qid p9.QID // readDirMu protects against concurrent Readdir calls. readDirMu sync.Mutex @@ -253,83 +226,88 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { return fd.New(d), nil } -func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) { +func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) { path := path.Join(parent.hostPath, name) - f, err := openAnyFile(path, func(mode int) (*fd.FD, error) { + f, readable, err := openAnyFile(path, func(mode int) (*fd.FD, error) { return fd.OpenAt(parent.file, name, openFlags|mode, 0) }) - return f, path, err + return f, path, readable, err } // openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback // to O_PATH. 'path' is used for logging messages only. 'fn' is what does the // actual file open and is customizable by the caller. -func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) { +func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) { // Attempt to open file in the following mode in order: // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. // Use non-blocking to prevent getting stuck inside open(2) for // FIFOs. This option has no effect on regular files. // 2. PATH: for symlinks, sockets. - modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} + options := []struct { + mode int + readable bool + }{ + { + mode: syscall.O_RDONLY | syscall.O_NONBLOCK, + readable: true, + }, + { + mode: unix.O_PATH, + readable: false, + }, + } var err error - var file *fd.FD - for i, mode := range modes { - file, err = fn(mode) + for i, option := range options { + var file *fd.FD + file, err = fn(option.mode) if err == nil { - // openat succeeded, we're done. - break + // Succeeded opening the file, we're done. + return file, option.readable, nil } switch e := extractErrno(err); e { case syscall.ENOENT: // File doesn't exist, no point in retrying. - return nil, e + return nil, false, e } - // openat failed. Try again with next mode, preserving 'err' in case this - // was the last attempt. - log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err) - } - if err != nil { - // All attempts to open file have failed, return the last error. - log.Debugf("Failed to open file, path: %q, err: %v", path, err) - return nil, extractErrno(err) + // File failed to open. Try again with next mode, preserving 'err' in case + // this was the last attempt. + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, path, err) } - - return file, nil + // All attempts to open file have failed, return the last error. + log.Debugf("Failed to open file, path: %q, err: %v", path, err) + return nil, false, extractErrno(err) } -func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) { - var ft fileType +func checkSupportedFileType(stat syscall.Stat_t, permitSocket bool) error { switch stat.Mode & syscall.S_IFMT { - case syscall.S_IFREG: - ft = regular - case syscall.S_IFDIR: - ft = directory - case syscall.S_IFLNK: - ft = symlink + case syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK: + return nil + case syscall.S_IFSOCK: if !permitSocket { - return unknown, syscall.EPERM + return syscall.EPERM } - ft = socket + return nil + default: - return unknown, syscall.EPERM + return syscall.EPERM } - return ft, nil } -func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) { - ft, err := getSupportedFileType(stat, a.conf.HostUDS) - if err != nil { +func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat syscall.Stat_t) (*localFile, error) { + if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil { return nil, err } return &localFile{ - attachPoint: a, - hostPath: path, - file: file, - mode: invalidMode, - ft: ft, + attachPoint: a, + hostPath: path, + file: file, + mode: invalidMode, + fileType: stat.Mode & syscall.S_IFMT, + qid: a.makeQID(stat), + controlReadable: readable, }, nil } @@ -348,13 +326,13 @@ func newFDMaybe(file *fd.FD) *fd.FD { // fd is blocking; non-blocking is required. if err := syscall.SetNonblock(dup.FD(), true); err != nil { - dup.Close() + _ = dup.Close() return nil } return dup } -func stat(fd int) (syscall.Stat_t, error) { +func fstat(fd int) (syscall.Stat_t, error) { var stat syscall.Stat_t if err := syscall.Fstat(fd, &stat); err != nil { return syscall.Stat_t{}, err @@ -362,6 +340,14 @@ func stat(fd int) (syscall.Stat_t, error) { return stat, nil } +func stat(path string) (syscall.Stat_t, error) { + var stat syscall.Stat_t + if err := syscall.Stat(path, &stat); err != nil { + return syscall.Stat_t{}, err + } + return stat, nil +} + func fchown(fd int, uid p9.UID, gid p9.GID) error { return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } @@ -374,7 +360,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // Check if control file can be used or if a new open must be created. var newFile *fd.FD - if flags == p9.ReadOnly { + if flags == p9.ReadOnly && l.controlReadable { log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath) newFile = l.file } else { @@ -390,16 +376,8 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { } } - stat, err := stat(newFile.FD()) - if err != nil { - if newFile != l.file { - newFile.Close() - } - return nil, p9.QID{}, 0, extractErrno(err) - } - var fd *fd.FD - if stat.Mode&syscall.S_IFMT == syscall.S_IFREG { + if l.fileType == syscall.S_IFREG { // Donate FD for regular files only. fd = newFDMaybe(newFile) } @@ -412,7 +390,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { l.file = newFile } l.mode = flags & p9.OpenFlagsModeMask - return fd, l.attachPoint.makeQID(stat), 0, nil + return fd, l.qid, 0, nil } // Create implements p9.File. @@ -440,7 +418,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid return nil, nil, p9.QID{}, 0, extractErrno(err) } cu := cleanup.Make(func() { - child.Close() + _ = child.Close() // Best effort attempt to remove the file in case of failure. if err := syscall.Unlinkat(l.file.FD(), name); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) @@ -451,7 +429,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid if err := fchown(child.FD(), uid, gid); err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } - stat, err := stat(child.FD()) + stat, err := fstat(child.FD()) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } @@ -461,10 +439,12 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid hostPath: path.Join(l.hostPath, name), file: child, mode: mode, + fileType: syscall.S_IFREG, + qid: l.attachPoint.makeQID(stat), } cu.Release() - return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil + return newFDMaybe(c.file), c, c.qid, 0, nil } // Mkdir implements p9.File. @@ -499,7 +479,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -510,55 +490,74 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) // Walk implements p9.File. func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { + qids, file, _, err := l.walk(names) + return qids, file, err +} + +// WalkGetAttr implements p9.File. +func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) { + qids, file, stat, err := l.walk(names) + if err != nil { + return nil, nil, p9.AttrMask{}, p9.Attr{}, err + } + mask, attr := l.fillAttr(stat) + return qids, file, mask, attr, nil +} + +func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, error) { // Duplicate current file if 'names' is empty. if len(names) == 0 { - newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { + newFile, readable, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { return reopenProcFd(l.file, openFlags|mode) }) if err != nil { - return nil, nil, extractErrno(err) + return nil, nil, syscall.Stat_t{}, extractErrno(err) } - stat, err := stat(newFile.FD()) + stat, err := fstat(newFile.FD()) if err != nil { - newFile.Close() - return nil, nil, extractErrno(err) + _ = newFile.Close() + return nil, nil, syscall.Stat_t{}, extractErrno(err) } c := &localFile{ - attachPoint: l.attachPoint, - hostPath: l.hostPath, - file: newFile, - mode: invalidMode, + attachPoint: l.attachPoint, + hostPath: l.hostPath, + file: newFile, + mode: invalidMode, + fileType: l.fileType, + qid: l.attachPoint.makeQID(stat), + controlReadable: readable, } - return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil + return []p9.QID{c.qid}, c, stat, nil } var qids []p9.QID + var lastStat syscall.Stat_t last := l for _, name := range names { - f, path, err := openAnyFileFromParent(last, name) + f, path, readable, err := openAnyFileFromParent(last, name) if last != l { - last.Close() + _ = last.Close() } if err != nil { - return nil, nil, extractErrno(err) + return nil, nil, syscall.Stat_t{}, extractErrno(err) } - stat, err := stat(f.FD()) + lastStat, err = fstat(f.FD()) if err != nil { - f.Close() - return nil, nil, extractErrno(err) + _ = f.Close() + return nil, nil, syscall.Stat_t{}, extractErrno(err) } - c, err := newLocalFile(last.attachPoint, f, path, stat) + c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat) if err != nil { - f.Close() - return nil, nil, extractErrno(err) + _ = f.Close() + return nil, nil, syscall.Stat_t{}, extractErrno(err) } - qids = append(qids, l.attachPoint.makeQID(stat)) + qids = append(qids, c.qid) last = c } - return qids, last, nil + return qids, last, lastStat, nil } // StatFS implements p9.File. @@ -594,11 +593,15 @@ func (l *localFile) FSync() error { // GetAttr implements p9.File. func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - stat, err := stat(l.file.FD()) + stat, err := fstat(l.file.FD()) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } + mask, attr := l.fillAttr(stat) + return l.qid, mask, attr, nil +} +func (l *localFile) fillAttr(stat syscall.Stat_t) (p9.AttrMask, p9.Attr) { attr := p9.Attr{ Mode: p9.FileMode(stat.Mode), UID: p9.UID(stat.Uid), @@ -627,8 +630,7 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) MTime: true, CTime: true, } - - return l.attachPoint.makeQID(stat), valid, attr, nil + return valid, attr } // SetAttr implements p9.File. Due to mismatch in file API, options @@ -669,7 +671,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // Check if it's possible to use cached file, or if another one needs to be // opened for write. f := l.file - if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { + if l.fileType == syscall.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { var err error f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY) if err != nil { @@ -725,7 +727,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { } } - if l.ft == symlink { + if l.fileType == syscall.S_IFLNK { // utimensat operates different that other syscalls. To operate on a // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty // name. @@ -882,7 +884,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -909,13 +911,39 @@ func (l *localFile) Link(target p9.File, newName string) error { } // Mknod implements p9.File. -// -// Not implemented. -func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { +func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return p9.QID{}, syscall.EROFS + } + + hostPath := path.Join(l.hostPath, name) + + // Return EEXIST if the file already exists. + if _, err := stat(hostPath); err == nil { + return p9.QID{}, syscall.EEXIST + } + // From mknod(2) man page: // "EPERM: [...] if the filesystem containing pathname does not support // the type of node requested." - return p9.QID{}, syscall.EPERM + if mode.FileType() != p9.ModeRegular { + return p9.QID{}, syscall.EPERM + } + + // Allow Mknod to create regular files. + if err := syscall.Mknod(hostPath, uint32(mode), 0); err != nil { + return p9.QID{}, err + } + + stat, err := stat(hostPath) + if err != nil { + return p9.QID{}, extractErrno(err) + } + return l.attachPoint.makeQID(stat), nil } // UnlinkAt implements p9.File. @@ -1081,13 +1109,13 @@ func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) { } if err := syscall.SetNonblock(f, true); err != nil { - syscall.Close(f) + _ = syscall.Close(f) return nil, err } sa := syscall.SockaddrUnix{Name: l.hostPath} if err := syscall.Connect(f, &sa); err != nil { - syscall.Close(f) + _ = syscall.Close(f) return nil, err } diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 05af7e397..94f167417 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -26,6 +26,19 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/test/testutil" +) + +var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} + +var ( + allTypes = []uint32{syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK} + + // allConfs is set in init(). + allConfs []Config + + rwConfs = []Config{{ROMount: false}} + roConfs = []Config{{ROMount: true}} ) func init() { @@ -39,6 +52,13 @@ func init() { } } +func configTestName(config *Config) string { + if config.ROMount { + return "ROMount" + } + return "RWMount" +} + func assertPanic(t *testing.T, f func()) { defer func() { if r := recover(); r == nil { @@ -88,71 +108,76 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error { return nil } -var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} - -var ( - allTypes = []fileType{regular, directory, symlink} - - // allConfs is set in init() above. - allConfs []Config - - rwConfs = []Config{{ROMount: false}} - roConfs = []Config{{ROMount: true}} -) - type state struct { - root *localFile - file *localFile - conf Config - ft fileType + root *localFile + file *localFile + conf Config + fileType uint32 } func (s state) String() string { - return fmt.Sprintf("type(%v)", s.ft) + return fmt.Sprintf("type(%v)", s.fileType) +} + +func typeName(fileType uint32) string { + switch fileType { + case syscall.S_IFREG: + return "file" + case syscall.S_IFDIR: + return "directory" + case syscall.S_IFLNK: + return "symlink" + default: + panic(fmt.Sprintf("invalid file type for test: %d", fileType)) + } } func runAll(t *testing.T, test func(*testing.T, state)) { runCustom(t, allTypes, allConfs, test) } -func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) { +func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, state)) { for _, c := range confs { - t.Logf("Config: %+v", c) - for _, ft := range types { - t.Logf("File type: %v", ft) + name := fmt.Sprintf("%s/%s", configTestName(&c), typeName(ft)) + t.Run(name, func(t *testing.T) { + path, name, err := setup(ft) + if err != nil { + t.Fatalf("%v", err) + } + defer os.RemoveAll(path) - path, name, err := setup(ft) - if err != nil { - t.Fatalf("%v", err) - } - defer os.RemoveAll(path) + a, err := NewAttachPoint(path, c) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + t.Fatalf("Attach failed, err: %v", err) + } - a, err := NewAttachPoint(path, c) - if err != nil { - t.Fatalf("NewAttachPoint failed: %v", err) - } - root, err := a.Attach() - if err != nil { - t.Fatalf("Attach failed, err: %v", err) - } + _, file, err := root.Walk([]string{name}) + if err != nil { + root.Close() + t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) + } - _, file, err := root.Walk([]string{name}) - if err != nil { + st := state{ + root: root.(*localFile), + file: file.(*localFile), + conf: c, + fileType: ft, + } + test(t, st) + file.Close() root.Close() - t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) - } - - st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft} - test(t, st) - file.Close() - root.Close() + }) } } } -func setup(ft fileType) (string, string, error) { - path, err := ioutil.TempDir("", "root-") +func setup(fileType uint32) (string, string, error) { + path, err := ioutil.TempDir(testutil.TmpDir(), "root-") if err != nil { return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err) } @@ -169,26 +194,26 @@ func setup(ft fileType) (string, string, error) { defer root.Close() var name string - switch ft { - case regular: + switch fileType { + case syscall.S_IFREG: name = "file" _, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err) } defer f.Close() - case directory: + case syscall.S_IFDIR: name = "dir" if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err) } - case symlink: + case syscall.S_IFLNK: name = "symlink" if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err) } default: - panic(fmt.Sprintf("unknown file type %v", ft)) + panic(fmt.Sprintf("unknown file type %v", fileType)) } return path, name, nil } @@ -202,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) { } func TestReadWrite(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) { child, err := createFile(s.file, "test") if err != nil { t.Fatalf("%v: createFile() failed, err: %v", s, err) @@ -232,7 +257,7 @@ func TestReadWrite(t *testing.T) { } func TestCreate(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) { for i, flags := range allOpenFlags { _, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { @@ -249,7 +274,7 @@ func TestCreate(t *testing.T) { // TestReadWriteDup tests that a file opened in any mode can be dup'ed and // reopened in any other mode. func TestReadWriteDup(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) { child, err := createFile(s.file, "test") if err != nil { t.Fatalf("%v: createFile() failed, err: %v", s, err) @@ -291,7 +316,7 @@ func TestReadWriteDup(t *testing.T) { } func TestUnopened(t *testing.T) { - runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFREG}, allConfs, func(t *testing.T, s state) { b := []byte("foobar") if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF { t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err) @@ -308,6 +333,32 @@ func TestUnopened(t *testing.T) { }) } +// TestOpenOPath is a regression test to ensure that a file that cannot be open +// for read is allowed to be open. This was happening because the control file +// was open with O_PATH, but Open() was not checking for it and allowing the +// control file to be reused. +func TestOpenOPath(t *testing.T) { + runCustom(t, []uint32{syscall.S_IFREG}, rwConfs, func(t *testing.T, s state) { + // Fist remove all permissions on the file. + if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil { + t.Fatalf("SetAttr(): %v", err) + } + // Then walk to the file again to open a new control file. + filename := filepath.Base(s.file.hostPath) + _, newFile, err := s.root.Walk([]string{filename}) + if err != nil { + t.Fatalf("root.Walk(%q): %v", filename, err) + } + + if newFile.(*localFile).controlReadable { + t.Fatalf("control file didn't open with O_PATH: %+v", newFile) + } + if _, _, _, err := newFile.Open(p9.ReadOnly); err != syscall.EACCES { + t.Fatalf("Open() should have failed, got: %v, wanted: EACCES", err) + } + }) +} + func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) { if err := l.SetAttr(valid, attr); err != nil { return p9.Attr{}, err @@ -324,7 +375,7 @@ func TestSetAttrPerm(t *testing.T) { valid := p9.SetAttrMask{Permissions: true} attr := p9.SetAttr{Permissions: 0777} got, err := SetGetAttr(s.file, valid, attr) - if s.ft == symlink { + if s.fileType == syscall.S_IFLNK { if err == nil { t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) } @@ -345,7 +396,7 @@ func TestSetAttrSize(t *testing.T) { valid := p9.SetAttrMask{Size: true} attr := p9.SetAttr{Size: size} got, err := SetGetAttr(s.file, valid, attr) - if s.ft == symlink || s.ft == directory { + if s.fileType == syscall.S_IFLNK || s.fileType == syscall.S_IFDIR { if err == nil { t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) } @@ -427,7 +478,7 @@ func TestLink(t *testing.T) { } err = dir.Link(s.file, linkFile) - if s.ft == directory { + if s.fileType == syscall.S_IFDIR { if err != syscall.EPERM { t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err) } @@ -485,7 +536,7 @@ func TestROMountPanics(t *testing.T) { } func TestWalkNotFound(t *testing.T) { - runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, allConfs, func(t *testing.T, s state) { if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT { t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err) } @@ -506,7 +557,7 @@ func TestWalkDup(t *testing.T) { } func TestReaddir(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) { name := "dir" if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err) diff --git a/runsc/main.go b/runsc/main.go index 920ed84a5..69cb505fa 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -71,7 +71,9 @@ var ( platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.") - softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.") + softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.") + txChecksumOffload = flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.") + rxChecksumOffload = flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.") qDisc = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") @@ -86,6 +88,7 @@ var ( referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.") cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") + fuseEnabled = flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") @@ -223,6 +226,8 @@ func main() { Network: netType, HardwareGSO: *hardwareGSO, SoftwareGSO: *softwareGSO, + TXChecksumOffload: *txChecksumOffload, + RXChecksumOffload: *rxChecksumOffload, LogPackets: *logPackets, Platform: platformType, Strace: *strace, @@ -238,6 +243,7 @@ func main() { OverlayfsStaleRead: *overlayfsStaleRead, CPUNumFromQuota: *cpuNumFromQuota, VFS2: *vfs2Enabled, + FUSE: *fuseEnabled, QDisc: queueingDiscipline, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, TestOnlyTestNameEnv: *testOnlyTestNameEnv, diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 035dcd3e3..2b9d4549d 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -29,7 +29,7 @@ go_library( "//runsc/console", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 209bfdb20..817a923ad 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -62,7 +62,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels, conf.QDisc); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: @@ -115,7 +115,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { @@ -134,7 +134,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG return err } if isRoot { - return fmt.Errorf("cannot run with network enabled in root network namespace") } @@ -197,11 +196,13 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG } link := boot.FDBasedLink{ - Name: iface.Name, - MTU: iface.MTU, - Routes: routes, - NumChannels: numNetworkChannels, - QDisc: qDisc, + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + TXChecksumOffload: txChecksumOffload, + RXChecksumOffload: rxChecksumOffload, + NumChannels: numNetworkChannels, + QDisc: qDisc, } // Get the link for the interface. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 6e1a2af25..2afcc27af 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -478,9 +478,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF // If the console control socket file is provided, then create a new // pty master/slave pair and set the TTY on the sandbox process. - if args.ConsoleSocket != "" { - cmd.Args = append(cmd.Args, "--console=true") - + if args.Spec.Process.Terminal && args.ConsoleSocket != "" { // console.NewWithSocket will send the master on the given // socket, and return the slave. tty, err := console.NewWithSocket(args.ConsoleSocket) diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 4ccd77f63..43851a22f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -17,7 +17,8 @@ go_library( "//pkg/log", "//pkg/sentry/kernel/auth", "@com_github_cenkalti_backoff//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_mohae_deepcopy//:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], @@ -28,5 +29,5 @@ go_test( size = "small", srcs = ["specutils_test.go"], library = ":specutils", - deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"], + deps = ["@com_github_opencontainers_runtime_spec//specs-go:go_default_library"], ) diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index f1fa573c5..5015c3a84 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -29,6 +29,7 @@ import ( "time" "github.com/cenkalti/backoff" + "github.com/mohae/deepcopy" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" @@ -44,20 +45,31 @@ var ExePath = "/proc/self/exe" var Version = specs.Version // LogSpec logs the spec in a human-friendly way. -func LogSpec(spec *specs.Spec) { - log.Debugf("Spec: %+v", spec) - log.Debugf("Spec.Hooks: %+v", spec.Hooks) - log.Debugf("Spec.Linux: %+v", spec.Linux) - if spec.Linux != nil && spec.Linux.Resources != nil { - res := spec.Linux.Resources - log.Debugf("Spec.Linux.Resources.Memory: %+v", res.Memory) - log.Debugf("Spec.Linux.Resources.CPU: %+v", res.CPU) - log.Debugf("Spec.Linux.Resources.BlockIO: %+v", res.BlockIO) - log.Debugf("Spec.Linux.Resources.Network: %+v", res.Network) - } - log.Debugf("Spec.Process: %+v", spec.Process) - log.Debugf("Spec.Root: %+v", spec.Root) - log.Debugf("Spec.Mounts: %+v", spec.Mounts) +func LogSpec(orig *specs.Spec) { + if !log.IsLogging(log.Debug) { + return + } + + // Strip down parts of the spec that are not interesting. + spec := deepcopy.Copy(orig).(*specs.Spec) + if spec.Process != nil { + spec.Process.Capabilities = nil + } + if spec.Linux != nil { + spec.Linux.Seccomp = nil + spec.Linux.MaskedPaths = nil + spec.Linux.ReadonlyPaths = nil + if spec.Linux.Resources != nil { + spec.Linux.Resources.Devices = nil + } + } + + out, err := json.MarshalIndent(spec, "", " ") + if err != nil { + log.Debugf("Failed to marshal spec: %v", err) + return + } + log.Debugf("Spec:\n%s", out) } // ValidateSpec validates that the spec is compatible with runsc. |