Diffstat (limited to 'runsc')
40 files changed, 1093 insertions, 421 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 248f77c34..8c73dc5dc 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -38,6 +38,7 @@ go_library( "//pkg/memutil", "//pkg/rand", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", @@ -74,6 +75,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/sighandling", "//pkg/sentry/socket/hostinet", + "//pkg/sentry/socket/netfilter", "//pkg/sentry/socket/netlink", "//pkg/sentry/socket/netlink/route", "//pkg/sentry/socket/netlink/uevent", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 7076ae2e2..a3a76b609 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -53,7 +53,7 @@ type compatEmitter struct { func newCompatEmitter(logFD int) (*compatEmitter, error) { nameMap, ok := getSyscallNameMap() if !ok { - return nil, fmt.Errorf("Linux syscall table not found") + return nil, fmt.Errorf("syscall table not found") } c := &compatEmitter{ diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 894651519..865126ac5 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -30,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/state" "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" @@ -195,7 +196,7 @@ type containerManager struct { // StartRoot will start the root container process. func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { - log.Debugf("containerManager.StartRoot %q", *cid) + log.Debugf("containerManager.StartRoot, cid: %s", *cid) // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} if err := <-cm.startResultChan; err != nil { @@ -206,14 +207,35 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { // Processes retrieves information about processes running in the sandbox. func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error { - log.Debugf("containerManager.Processes: %q", *cid) + log.Debugf("containerManager.Processes, cid: %s", *cid) return control.Processes(cm.l.k, *cid, out) } +// CreateArgs contains arguments to the Create method. +type CreateArgs struct { + // CID is the ID of the container to start. + CID string + + // FilePayload may contain a TTY file for the terminal, if enabled. + urpc.FilePayload +} + // Create creates a container within a sandbox. -func (cm *containerManager) Create(cid *string, _ *struct{}) error { - log.Debugf("containerManager.Create: %q", *cid) - return cm.l.createContainer(*cid) +func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error { + log.Debugf("containerManager.Create: %s", args.CID) + + if len(args.Files) > 1 { + return fmt.Errorf("start arguments must have at most 1 file for TTY") + } + var tty *fd.FD + if len(args.Files) == 1 { + var err error + tty, err = fd.NewFromFile(args.Files[0]) + if err != nil { + return fmt.Errorf("error dup'ing TTY file: %w", err) + } + } + return cm.l.createContainer(args.CID, tty) } // StartArgs contains arguments to the Start method. @@ -228,20 +250,18 @@ type StartArgs struct { CID string // FilePayload contains, in order: - // * stdin, stdout, and stderr. - // * the file descriptor over which the sandbox will - // request files from its root filesystem. + // * stdin, stdout, and stderr (optional: if terminal is disabled).
+ // * file descriptors to connect to gofer to serve the root filesystem. urpc.FilePayload } // Start runs a created container within a sandbox. func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { - log.Debugf("containerManager.Start: %+v", args) - // Validate arguments. if args == nil { return errors.New("start missing arguments") } + log.Debugf("containerManager.Start, cid: %s, args: %+v", args.CID, args) if args.Spec == nil { return errors.New("start arguments missing spec") } @@ -251,44 +271,66 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } - if len(args.FilePayload.Files) < 4 { - return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") + if len(args.Files) < 1 { + return fmt.Errorf("start arguments must contain at least one file for the container root gofer") } // All validation passed, logs the spec for debugging. specutils.LogSpec(args.Spec) - fds, err := fd.NewFromFiles(args.FilePayload.Files) + goferFiles := args.Files + var stdios []*fd.FD + if !args.Spec.Process.Terminal { + // When not using a terminal, stdios come as the first 3 files in the + // payload. + if l := len(args.Files); l < 4 { + return fmt.Errorf("start arguments (len: %d) must contain stdios and files for the container root gofer", l) + } + var err error + stdios, err = fd.NewFromFiles(goferFiles[:3]) + if err != nil { + return fmt.Errorf("error dup'ing stdio files: %w", err) + } + goferFiles = goferFiles[3:] + } + defer func() { + for _, fd := range stdios { + _ = fd.Close() + } + }() + + goferFDs, err := fd.NewFromFiles(goferFiles) if err != nil { - return err + return fmt.Errorf("error dup'ing gofer files: %w", err) } defer func() { - for _, fd := range fds { + for _, fd := range goferFDs { _ = fd.Close() } }() - if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil { - log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) + + if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil { + log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err) return err } - log.Debugf("Container %q started", args.CID) + log.Debugf("Container started, cid: %s", args.CID) return nil } // Destroy stops a container if it is still running and cleans up its // filesystem. func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { - log.Debugf("containerManager.destroy %q", *cid) + log.Debugf("containerManager.destroy, cid: %s", *cid) return cm.l.destroyContainer(*cid) } // ExecuteAsync starts running a command on a created or running sandbox. It // returns the PID of the new process. 
func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { - log.Debugf("containerManager.ExecuteAsync: %+v", args) + log.Debugf("containerManager.ExecuteAsync, cid: %s, args: %+v", args.ContainerID, args) tgid, err := cm.l.executeAsync(args) if err != nil { - log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, err) + log.Debugf("containerManager.ExecuteAsync failed, cid: %s, args: %+v, err: %v", args.ContainerID, args, err) return err } *pid = int32(tgid) @@ -330,18 +372,18 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") var specFile, deviceFile *os.File - switch numFiles := len(o.FilePayload.Files); numFiles { + switch numFiles := len(o.Files); numFiles { case 2: // The device file is donated to the platform. // Can't take ownership away from os.File. dup them to get a new FD. - fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd())) + fd, err := syscall.Dup(int(o.Files[1].Fd())) if err != nil { return fmt.Errorf("failed to dup file: %v", err) } deviceFile = os.NewFile(uintptr(fd), "platform device") fallthrough case 1: - specFile = o.FilePayload.Files[0] + specFile = o.Files[0] case 0: return fmt.Errorf("at least one file must be passed to Restore") default: @@ -367,12 +409,20 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. + ctx := k.SupervisorContext() mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints) - renv, err := mntr.createRestoreEnvironment(cm.l.root.conf) - if err != nil { - return fmt.Errorf("creating RestoreEnvironment: %v", err) + if kernel.VFS2Enabled { + ctx, err = mntr.configureRestore(ctx, cm.l.root.conf) + if err != nil { + return fmt.Errorf("configuring filesystem restore: %v", err) + } + } else { + renv, err := mntr.createRestoreEnvironment(cm.l.root.conf) + if err != nil { + return fmt.Errorf("creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) } - fs.SetRestoreEnvironment(*renv) // Prepare to load from the state file. if eps, ok := networkStack.(*netstack.Stack); ok { @@ -399,7 +449,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Load the state. loadOpts := state.LoadOpts{Source: specFile} - if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil { + if err := loadOpts.Load(ctx, k, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil { return err } @@ -444,9 +494,9 @@ func (cm *containerManager) Resume(_, _ *struct{}) error { // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { - log.Debugf("containerManager.Wait") + log.Debugf("containerManager.Wait, cid: %s", *cid) err := cm.l.waitContainer(*cid, waitStatus) - log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", waitStatus, err) + log.Debugf("containerManager.Wait returned, cid: %s, waitStatus: %#x, err: %v", *cid, *waitStatus, err) return err } @@ -461,8 +511,10 @@ type WaitPIDArgs struct { // WaitPID waits for the process with PID 'pid' in the sandbox. 
func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { - log.Debugf("containerManager.Wait") - return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) + log.Debugf("containerManager.Wait, cid: %s, pid: %d", args.CID, args.PID) + err := cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) + log.Debugf("containerManager.Wait, cid: %s, pid: %d, waitStatus: %#x, err: %v", args.CID, args.PID, *waitStatus, err) + return err } // SignalDeliveryMode enumerates different signal delivery modes. @@ -519,6 +571,6 @@ type SignalArgs struct { // indicated process, to all processes in the container, or to the foreground // process group. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { - log.Debugf("containerManager.Signal %+v", args) + log.Debugf("containerManager.Signal: cid: %s, PID: %d, signal: %d, mode: %v", args.CID, args.PID, args.Signo, args.Mode) return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode) } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index a7c4ebb0c..4e3bb9ac7 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -343,6 +343,16 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_PKTINFO), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IPV6), seccomp.EqualTo(syscall.IPV6_TCLASS), }, @@ -358,6 +368,11 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_SOCKET), seccomp.EqualTo(syscall.SO_ERROR), }, @@ -393,6 +408,11 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TIMESTAMP), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_TCP), seccomp.EqualTo(syscall.TCP_NODELAY), }, @@ -401,6 +421,11 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.EqualTo(syscall.SOL_TCP), seccomp.EqualTo(syscall.TCP_INFO), }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(linux.TCP_INQ), + }, }, syscall.SYS_IOCTL: []seccomp.Rule{ { @@ -449,6 +474,13 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TIMESTAMP), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_TCP), seccomp.EqualTo(syscall.TCP_NODELAY), seccomp.MatchAny{}, @@ -456,6 +488,13 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(linux.TCP_INQ), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IP), seccomp.EqualTo(syscall.IP_TOS), seccomp.MatchAny{}, @@ -470,6 +509,20 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_PKTINFO), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IPV6), seccomp.EqualTo(syscall.IPV6_TCLASS), seccomp.MatchAny{}, @@ -482,6 
+535,13 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.MatchAny{}, seccomp.EqualTo(4), }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index ddf288456..2b0d2cd51 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -22,15 +22,6 @@ import ( "strings" "syscall" - // Include filesystem types that OCI spec might mount. - _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" - _ "gvisor.dev/gvisor/pkg/sentry/fs/host" - _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" - _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" - _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" - "gvisor.dev/gvisor/pkg/sentry/vfs" - specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -48,9 +39,18 @@ import ( tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" + + // Include filesystem types that OCI spec might mount. + _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" + _ "gvisor.dev/gvisor/pkg/sentry/fs/host" + _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" + _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" ) const ( @@ -105,33 +105,28 @@ func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name // mandatory mounts that are required by the OCI specification. func compileMounts(spec *specs.Spec) []specs.Mount { // Keep track of whether proc and sys were mounted. - var procMounted, sysMounted bool + var procMounted, sysMounted, devMounted, devptsMounted bool var mounts []specs.Mount - // Always mount /dev. - mounts = append(mounts, specs.Mount{ - Type: devtmpfs.Name, - Destination: "/dev", - }) - - mounts = append(mounts, specs.Mount{ - Type: devpts.Name, - Destination: "/dev/pts", - }) - // Mount all submounts from the spec. for _, m := range spec.Mounts { if !specutils.IsSupportedDevMount(m) { log.Warningf("ignoring dev mount at %q", m.Destination) continue } - mounts = append(mounts, m) switch filepath.Clean(m.Destination) { case "/proc": procMounted = true case "/sys": sysMounted = true + case "/dev": + m.Type = devtmpfs.Name + devMounted = true + case "/dev/pts": + m.Type = devpts.Name + devptsMounted = true } + mounts = append(mounts, m) } // Mount proc and sys even if the user did not ask for it, as the spec @@ -149,6 +144,18 @@ func compileMounts(spec *specs.Spec) []specs.Mount { Destination: "/sys", }) } + if !devMounted { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: devtmpfs.Name, + Destination: "/dev", + }) + } + if !devptsMounted { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: devpts.Name, + Destination: "/dev/pts", + }) + } // The mandatory mounts should be ordered right after the root, in case // there are submounts of these mandatory mounts already in the spec. 
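The compileMounts hunk above no longer prepends /dev and /dev/pts unconditionally: user-supplied mounts at those destinations keep their position in the spec but are retyped to the sentry-internal filesystems, and mandatory mounts are appended only for destinations the spec omitted. A minimal, self-contained sketch of that rule follows; the compileDevMounts helper and the simplified mount type are illustrative stand-ins for the real compileMounts over specs.Mount, and the "devtmpfs"/"devpts" strings stand in for the devtmpfs.Name and devpts.Name constants:

package main

import (
	"fmt"
	"path/filepath"
)

// mount is a simplified stand-in for specs.Mount.
type mount struct {
	Type        string
	Destination string
}

// compileDevMounts mirrors the hunk above: /dev and /dev/pts mounts from the
// spec keep their position but are forced to the sentry-internal filesystem
// types, and mandatory mounts are added only for destinations the spec did
// not cover.
func compileDevMounts(specMounts []mount) []mount {
	var devMounted, devptsMounted bool
	var mounts []mount
	for _, m := range specMounts {
		switch filepath.Clean(m.Destination) {
		case "/dev":
			m.Type = "devtmpfs"
			devMounted = true
		case "/dev/pts":
			m.Type = "devpts"
			devptsMounted = true
		}
		mounts = append(mounts, m)
	}
	if !devMounted {
		mounts = append(mounts, mount{Type: "devtmpfs", Destination: "/dev"})
	}
	if !devptsMounted {
		mounts = append(mounts, mount{Type: "devpts", Destination: "/dev/pts"})
	}
	return mounts
}

func main() {
	// A spec with its own /dev mount: the type is rewritten in place and only
	// the missing mandatory /dev/pts mount is added.
	fmt.Println(compileDevMounts([]mount{{Type: "bind", Destination: "/dev"}}))
	// Output: [{devtmpfs /dev} {devpts /dev/pts}]
}

Note that the real code collects the mandatory mounts separately and orders them right after the root, as the trailing comment in the hunk explains; the sketch appends them at the end for brevity.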
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 8ad000497..3df013d34 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fdimport" @@ -49,6 +50,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/sighandling" + "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -155,6 +157,11 @@ type execProcess struct { // pidnsPath is the pid namespace path in spec pidnsPath string + + // hostTTY is present when creating a sub-container with terminal enabled. + // TTY file is passed during container create and must be saved until + // container start. + hostTTY *fd.FD } func init() { @@ -476,6 +483,10 @@ func (l *Loader) Destroy() { // save/restore. l.k.Release() + // All sentry-created resources should have been released at this point; + // check for reference leaks. + refsvfs2.DoLeakCheck() + // In the success case, stdioFDs and goferFDs will only contain // released/closed FDs that ownership has been passed over to host FDs and // gofer sessions. Close them here in case of failure. @@ -582,7 +593,9 @@ func (l *Loader) run() error { // Create the root container init task. It will begin running // when the kernel is started. - if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { + var err error + _, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(true, l.sandboxID, &l.root) + if err != nil { return err } @@ -621,7 +634,7 @@ func (l *Loader) run() error { } // createContainer creates a new container inside the sandbox. -func (l *Loader) createContainer(cid string) error { +func (l *Loader) createContainer(cid string, tty *fd.FD) error { l.mu.Lock() defer l.mu.Unlock() @@ -629,14 +642,14 @@ func (l *Loader) createContainer(cid string) error { if _, ok := l.processes[eid]; ok { return fmt.Errorf("container %q already exists", cid) } - l.processes[eid] = &execProcess{} + l.processes[eid] = &execProcess{hostTTY: tty} return nil } // startContainer starts a child container. It returns the thread group ID of // the newly created process. Used FDs are either closed or released. It's safe // for the caller to close any remaining files upon return. -func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error { +func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -689,36 +702,41 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin info := &containerInfo{ conf: conf, spec: spec, - stdioFDs: files[:3], - goferFDs: files[3:], + goferFDs: goferFDs, } info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } - tg, err := l.createContainerProcess(false, cid, info, ep) + + // Use stdios or TTY depending on the spec configuration. 
+ if spec.Process.Terminal { + if len(stdioFDs) > 0 { + return fmt.Errorf("using TTY, stdios not expected: %v", stdioFDs) + } + if ep.hostTTY == nil { + return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?") + } + info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY} + ep.hostTTY = nil + } else { + info.stdioFDs = stdioFDs + } + + ep.tg, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(false, cid, info) if err != nil { return err } - - // Success! - l.k.StartProcess(tg) - ep.tg = tg + l.k.StartProcess(ep.tg) return nil } -func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) { - console := false - if root { - // Only root container supports terminal for now. - console = info.spec.Process.Terminal - } - +func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Create the FD map, which will set stdin, stdout, and stderr. ctx := info.procArgs.NewContext(l.k) - fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs) + fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs) if err != nil { - return nil, fmt.Errorf("importing fds: %v", err) + return nil, nil, nil, fmt.Errorf("importing fds: %v", err) } // CreateProcess takes a reference on fdTable if successful. We won't need // ours either way. @@ -730,14 +748,14 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints) if root { if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { - return nil, err + return nil, nil, nil, err } } if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil { - return nil, err + return nil, nil, nil, err } - // Add the HOME enviroment variable if it is not already set. + // Add the HOME environment variable if it is not already set. var envv []string if kernel.VFS2Enabled { envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2, @@ -748,29 +766,25 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } if err != nil { - return nil, err + return nil, nil, nil, err } info.procArgs.Envv = envv // Create and start the new process. tg, _, err := l.k.CreateProcess(info.procArgs) if err != nil { - return nil, fmt.Errorf("creating process: %v", err) + return nil, nil, nil, fmt.Errorf("creating process: %v", err) } // CreateProcess takes a reference on FDTable if successful. info.procArgs.FDTable.DecRef(ctx) // Set the foreground process group on the TTY to the global init process // group, since that is what we are about to start running. - if root { - switch { - case ttyFileVFS2 != nil: - ep.ttyVFS2 = ttyFileVFS2 - ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) - case ttyFile != nil: - ep.tty = ttyFile - ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) - } + switch { + case ttyFileVFS2 != nil: + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFile != nil: + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) } // Install seccomp filters with the new task if there are any. 
@@ -778,7 +792,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) if err != nil { - return nil, fmt.Errorf("building seccomp program: %v", err) + return nil, nil, nil, fmt.Errorf("building seccomp program: %v", err) } if log.IsLogging(log.Debug) { @@ -789,7 +803,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn task := tg.Leader() // NOTE: It seems Flags are ignored by runc so we ignore them too. if err := task.AppendSyscallFilter(program, true); err != nil { - return nil, fmt.Errorf("appending seccomp filters: %v", err) + return nil, nil, nil, fmt.Errorf("appending seccomp filters: %v", err) } } } else { @@ -798,7 +812,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn } } - return tg, nil + return tg, ttyFile, ttyFileVFS2, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on @@ -882,7 +896,7 @@ func (l *Loader) destroyContainer(cid string) error { } } - log.Debugf("Container destroyed %q", cid) + log.Debugf("Container destroyed, cid: %s", cid) return nil } @@ -1068,7 +1082,12 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} - transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4} + transProtos := []stack.TransportProtocolFactory{ + tcp.NewProtocol, + udp.NewProtocol, + icmp.NewProtocol4, + icmp.NewProtocol6, + } s := netstack.Stack{stack.New(stack.Options{ NetworkProtocols: netProtos, TransportProtocols: transProtos, @@ -1079,6 +1098,7 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in // privileges. RawFactory: raw.EndpointFactory{}, UniqueID: uniqueID, + IPTables: netfilter.DefaultLinuxTables(), })} // Enable SACK Recovery. diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index e376f944b..b77b4762e 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -266,7 +266,7 @@ type CreateMountTestcase struct { func createMountTestcases() []*CreateMountTestcase { testCases := []*CreateMountTestcase{ - &CreateMountTestcase{ + { // Only proc. name: "only proc mount", spec: specs.Spec{ @@ -304,11 +304,10 @@ func createMountTestcases() []*CreateMountTestcase { }, }, }, - // /some/deep/path should be mounted, along with /proc, - // /dev, and /sys. + // /some/deep/path should be mounted, along with /proc, /dev, and /sys. expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - &CreateMountTestcase{ + { // Mounts are nested inside each other. 
name: "nested mounts", spec: specs.Spec{ @@ -352,7 +351,7 @@ func createMountTestcases() []*CreateMountTestcase { expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - &CreateMountTestcase{ + { name: "mount inside /dev", spec: specs.Spec{ Root: &specs.Root{ @@ -395,35 +394,37 @@ func createMountTestcases() []*CreateMountTestcase { }, expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, }, - } - - vfsCase := &CreateMountTestcase{ - name: "mounts inside mandatory mounts", - spec: specs.Spec{ - Root: &specs.Root{ - Path: os.TempDir(), - Readonly: true, - }, - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "tmpfs", - }, - { - Destination: "/sys/bar", - Type: "tmpfs", + { + name: "mounts inside mandatory mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, }, - - { - Destination: "/tmp/baz", - Type: "tmpfs", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/sys/bar", + Type: "tmpfs", + }, + { + Destination: "/tmp/baz", + Type: "tmpfs", + }, + { + Destination: "/dev/goo", + Type: "tmpfs", + }, }, }, + expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz", "/dev/goo"}, }, - expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"}, } - return append(testCases, vfsCase) + return testCases } // Test that MountNamespace can be created with various specs. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 988573640..3d3a813df 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" - "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -41,9 +40,9 @@ var ( // "::1/8" on "lo" interface. DefaultLoopbackLink = LoopbackLink{ Name: "lo", - Addresses: []net.IP{ - net.IP("\x7f\x00\x00\x01"), - net.IPv6loopback, + Addresses: []IPWithPrefix{ + {Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8}, + {Address: net.IPv6loopback, PrefixLen: 128}, }, Routes: []Route{ { @@ -83,7 +82,7 @@ type DefaultRoute struct { type FDBasedLink struct { Name string MTU int - Addresses []net.IP + Addresses []IPWithPrefix Routes []Route GSOMaxSize uint32 SoftwareGSOEnabled bool @@ -100,7 +99,7 @@ type FDBasedLink struct { // LoopbackLink configures a loopback li nk. type LoopbackLink struct { Name string - Addresses []net.IP + Addresses []IPWithPrefix Routes []Route } @@ -118,6 +117,19 @@ type CreateLinksAndRoutesArgs struct { Defaultv6Gateway DefaultRoute } +// IPWithPrefix is an address with its subnet prefix length. +type IPWithPrefix struct { + // Address is a network address. + Address net.IP + + // PrefixLen is the subnet prefix length. + PrefixLen int +} + +func (ip IPWithPrefix) String() string { + return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen) +} + // Empty returns true if route hasn't been set. func (r *Route) Empty() bool { return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil @@ -265,20 +277,19 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct // createNICWithAddrs creates a NIC in the network stack and adds the given // addresses. 
-func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error { +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []IPWithPrefix) error { opts := stack.NICOptions{Name: name} if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil { return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err) } - // Always start with an arp address for the NIC. - if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { - return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err) - } - for _, addr := range addrs { - proto, tcpipAddr := ipToAddressAndProto(addr) - if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil { + proto, tcpipAddr := ipToAddressAndProto(addr.Address) + ap := tcpip.AddressWithPrefix{ + Address: tcpipAddr, + PrefixLen: addr.PrefixLen, + } + if err := n.Stack.AddAddressWithPrefix(id, proto, ap); err != nil { return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err) } } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 004da5b40..3fd28e516 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -210,6 +210,9 @@ func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *c ReadOnly: c.root.Readonly, GetFilesystemOptions: vfs.GetFilesystemOptions{ Data: strings.Join(data, ","), + InternalData: gofer.InternalFilesystemOptions{ + UniqueID: "/", + }, }, InternalMount: true, } @@ -247,36 +250,76 @@ func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Cre overlayOpts := *lowerOpts overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} - // Next mount upper and lower. Upper is a tmpfs mount to keep all - // modifications inside the sandbox. - upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) - if err != nil { - return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) - } - cu := cleanup.Make(func() { upper.DecRef(ctx) }) - defer cu.Clean() - // All writes go to the upper layer, be paranoid and make lower readonly. lowerOpts.ReadOnly = true lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) if err != nil { return nil, nil, err } - cu.Add(func() { lower.DecRef(ctx) }) + cu := cleanup.Make(func() { lower.DecRef(ctx) }) + defer cu.Clean() - // Propagate the lower layer's root's owner, group, and mode to the upper - // layer's root for consistency with VFS1. - upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) + // Determine the lower layer's root's type. 
lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root()) stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{ Root: lowerRootVD, Start: lowerRootVD, }, &vfs.StatOptions{ - Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE, + Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE, }) if err != nil { - return nil, nil, err + return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err) + } + if stat.Mask&linux.STATX_TYPE == 0 { + return nil, nil, fmt.Errorf("failed to get file type of lower layer's root") + } + rootType := stat.Mode & linux.S_IFMT + if rootType != linux.S_IFDIR && rootType != linux.S_IFREG { + return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType) + } + + // Upper is a tmpfs mount to keep all modifications inside the sandbox. + upperOpts.GetFilesystemOptions.InternalData = tmpfs.FilesystemOpts{ + RootFileType: uint16(rootType), + } + upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) + if err != nil { + return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) + } + cu.Add(func() { upper.DecRef(ctx) }) + + // If the overlay mount consists of a regular file, copy up its contents + // from the lower layer, since in the overlay the otherwise-empty upper + // layer file will take precedence. + upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) + if rootType == linux.S_IFREG { + lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ + Root: lowerRootVD, + Start: lowerRootVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err) + } + defer lowerFD.DecRef(ctx) + upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ + Root: upperRootVD, + Start: upperRootVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY, + }) + if err != nil { + return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err) + } + defer upperFD.DecRef(ctx) + if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil { + return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err) + } } + + // Propagate the lower layer's root's owner, group, and mode to the upper + // layer's root for consistency with VFS1. err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{ Root: upperRootVD, Start: upperRootVD, @@ -427,6 +470,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo fsName := m.Type useOverlay := false var data []string + var iopts interface{} // Find filesystem name and FS specific data field. switch m.Type { @@ -451,6 +495,9 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo return "", nil, false, fmt.Errorf("9P mount requires a connection FD") } data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) + iopts = gofer.InternalFilesystemOptions{ + UniqueID: m.Destination, + } // If configured, add overlay to all writable mounts. 
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly @@ -462,7 +509,8 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo opts := &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ - Data: strings.Join(data, ","), + Data: strings.Join(data, ","), + InternalData: iopts, }, InternalMount: true, } @@ -667,3 +715,21 @@ func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Crede } return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds) } + +// configureRestore returns an updated context.Context including filesystem +// state used by restore defined by conf. +func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Config) (context.Context, error) { + fdmap := make(map[string]int) + fdmap["/"] = c.fds.remove() + mounts, err := c.prepareMountsVFS2() + if err != nil { + return ctx, err + } + for i := range c.mounts { + submount := &mounts[i] + if submount.fd >= 0 { + fdmap[submount.Destination] = submount.fd + } + } + return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil +} diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 56da21584..e5294de55 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -21,6 +21,7 @@ import ( "context", "errors", "fmt", + "io", "io/ioutil", "os", "path/filepath", @@ -198,8 +199,13 @@ func LoadPaths(pid string) (map[string]string, error) { } defer f.Close() + return loadPathsHelper(f) +} + +func loadPathsHelper(cgroup io.Reader) (map[string]string, error) { paths := make(map[string]string) - scanner := bufio.NewScanner(f) + + scanner := bufio.NewScanner(cgroup) for scanner.Scan() { // Format: ID:[name=]controller1,controller2:path // Example: 2:cpu,cpuacct:/user.slice @@ -207,6 +213,9 @@ func LoadPaths(pid string) (map[string]string, error) { if len(tokens) != 3 { return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text()) } + if len(tokens[1]) == 0 { + continue + } for _, ctrlr := range strings.Split(tokens[1], ",") { // Remove prefix for cgroups with no controller, eg. systemd. ctrlr = strings.TrimPrefix(ctrlr, "name=") @@ -225,7 +234,7 @@ type Cgroup struct { Name string `json:"name"` Parents map[string]string `json:"parents"` - Own bool `json:"own"` + Own map[string]bool `json:"own"` } // New creates a new Cgroup instance if the spec includes a cgroup path. @@ -242,9 +251,11 @@ func New(spec *specs.Spec) (*Cgroup, error) { return nil, fmt.Errorf("finding current cgroups: %w", err) } } + own := make(map[string]bool) return &Cgroup{ Name: spec.Linux.CgroupsPath, Parents: parents, + Own: own, }, nil } @@ -252,18 +263,8 @@ func New(spec *specs.Spec) (*Cgroup, error) { // already exists, it means that the caller has already provided a // pre-configured cgroups, and 'res' is ignored. func (c *Cgroup) Install(res *specs.LinuxResources) error { - if _, err := os.Stat(c.makePath("memory")); err == nil { - // If cgroup has already been created; it has been setup by caller. Don't - // make any changes to configuration, just join when sandbox/gofer starts. - log.Debugf("Using pre-created cgroup %q", c.Name) - return nil - } - log.Debugf("Creating cgroup %q", c.Name) - // Mark that cgroup resources are owned by me. - c.Own = true - // The Cleanup object cleans up partially created cgroups when an error occurs. // Errors occurring during cleanup itself are ignored.
clean := cleanup.Make(func() { _ = c.Uninstall() }) @@ -271,6 +272,16 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { for key, cfg := range controllers { path := c.makePath(key) + if _, err := os.Stat(path); err == nil { + // If cgroup has already been created; it has been setup by caller. Don't + // make any changes to configuration, just join when sandbox/gofer starts. + log.Debugf("Using pre-created cgroup %q", path) + continue + } + + // Mark that cgroup resources are owned by me. + c.Own[key] = true + if err := os.MkdirAll(path, 0755); err != nil { if cfg.optional && errors.Is(err, syscall.EROFS) { log.Infof("Skipping cgroup %q", key) @@ -289,12 +300,12 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { // Uninstall removes the settings done in Install(). If cgroup path already // existed when Install() was called, Uninstall is a noop. func (c *Cgroup) Uninstall() error { - if !c.Own { - // cgroup is managed by caller, don't touch it. - return nil - } log.Debugf("Deleting cgroup %q", c.Name) for key := range controllers { + if !c.Own[key] { + // cgroup is managed by caller, don't touch it. + continue + } path := c.makePath(key) log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 4db5ee5c3..931144cf9 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -29,7 +29,10 @@ func TestUninstallEnoent(t *testing.T) { c := Cgroup{ // set a non-existent name Name: "runsc-test-uninstall-656e6f656e740a", - Own: true, + } + c.Own = make(map[string]bool) + for key := range controllers { + c.Own[key] = true } if err := c.Uninstall(); err != nil { t.Errorf("Uninstall() failed: %v", err) @@ -647,3 +650,83 @@ func TestPids(t *testing.T) { }) } } + +func TestLoadPaths(t *testing.T) { + for _, tc := range []struct { + name string + cgroups string + want map[string]string + err string + }{ + { + name: "abs-path", + cgroups: "0:ctr:/path", + want: map[string]string{"ctr": "/path"}, + }, + { + name: "rel-path", + cgroups: "0:ctr:rel-path", + want: map[string]string{"ctr": "rel-path"}, + }, + { + name: "non-controller", + cgroups: "0:name=systemd:/path", + want: map[string]string{"systemd": "/path"}, + }, + { + name: "empty", + }, + { + name: "multiple", + cgroups: "0:ctr0:/path0\n" + + "1:ctr1:/path1\n" + + "2::/empty\n", + want: map[string]string{ + "ctr0": "/path0", + "ctr1": "/path1", + }, + }, + { + name: "missing-field", + cgroups: "0:nopath\n", + err: "invalid cgroups file", + }, + { + name: "too-many-fields", + cgroups: "0:ctr:/path:extra\n", + err: "invalid cgroups file", + }, + { + name: "multiple-malformed", + cgroups: "0:ctr0:/path0\n" + + "1:ctr1:/path1\n" + + "2:\n", + err: "invalid cgroups file", + }, + } { + t.Run(tc.name, func(t *testing.T) { + r := strings.NewReader(tc.cgroups) + got, err := loadPathsHelper(r) + if len(tc.err) == 0 { + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + } else if !strings.Contains(err.Error(), tc.err) { + t.Fatalf("Wrong error message, want: *%s*, got: %v", tc.err, err) + } + for key, vWant := range tc.want { + vGot, ok := got[key] + if !ok { + t.Errorf("Missing controller %q", key) + } + if vWant != vGot { + t.Errorf("Wrong controller %q value, want: %q, got: %q", key, vWant, vGot) + } + delete(got, key) + } + for k, v := range got { + t.Errorf("Unexpected controller %q: %q", k, v) + } + }) + } +} diff --git a/runsc/cli/main.go b/runsc/cli/main.go index bca015db5..6c3bf4d21 100644 --- 
a/runsc/cli/main.go +++ b/runsc/cli/main.go @@ -22,6 +22,7 @@ import ( "io/ioutil" "os" "os/signal" + "runtime" "syscall" "time" @@ -82,6 +83,7 @@ func Main(version string) { subcommands.Register(new(cmd.Spec), "") subcommands.Register(new(cmd.State), "") subcommands.Register(new(cmd.Start), "") + subcommands.Register(new(cmd.Symbolize), "") subcommands.Register(new(cmd.Wait), "") // Register internal commands with the internal group name. This causes @@ -207,6 +209,8 @@ func Main(version string) { log.Infof("***************************") log.Infof("Args: %s", os.Args) log.Infof("Version %s", version) + log.Infof("GOOS: %s", runtime.GOOS) + log.Infof("GOARCH: %s", runtime.GOARCH) log.Infof("PID: %d", os.Getpid()) log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) log.Infof("Configuration:") diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 2556f6d9e..19520d7ab 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -32,6 +32,7 @@ go_library( "start.go", "state.go", "statefile.go", + "symbolize.go", "syscalls.go", "wait.go", ], @@ -39,6 +40,7 @@ go_library( "//runsc:__subpackages__", ], deps = [ + "//pkg/coverage", "//pkg/log", "//pkg/p9", "//pkg/sentry/control", diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index cd419e1aa..2c92e3067 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -131,11 +131,11 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return subcommands.ExitUsageError } - // Ensure that if there is a panic, all goroutine stacks are printed. - debug.SetTraceback("system") - conf := args[0].(*config.Config) + // Set traceback level + debug.SetTraceback(conf.Traceback) + if b.attached { // Ensure this process is killed after parent process terminates when // attached mode is enabled. In the unfortunate event that the parent diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 8fe0c427a..c0bc8f064 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -75,7 +75,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa conf := args[0].(*config.Config) waitStatus := args[1].(*syscall.WaitStatus) - cont, err := container.Load(conf.RootDir, id) + cont, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading container: %v", err) } @@ -149,6 +149,9 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa } ws, err := cont.Wait() + if err != nil { + Fatalf("Error waiting for container: %v", err) + } *waitStatus = ws return subcommands.ExitSuccess diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 132198222..609e8231c 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -91,7 +91,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return subcommands.ExitUsageError } var err error - c, err = container.Load(conf.RootDir, f.Arg(0)) + c, err = container.LoadAndCheck(conf.RootDir, f.Arg(0)) if err != nil { return Errorf("loading container %q: %v", f.Arg(0), err) } @@ -106,7 +106,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return Errorf("listing containers: %v", err) } for _, id := range ids { - candidate, err := container.Load(conf.RootDir, id) + candidate, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { return Errorf("loading container %q: %v", id, err) } diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index 4e49deff8..a25637265 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -68,7 +68,7 @@ func (d *Delete) 
Execute(_ context.Context, f *flag.FlagSet, args ...interface{} func (d *Delete) execute(ids []string, conf *config.Config) error { for _, id := range ids { - c, err := container.Load(conf.RootDir, id) + c, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { if os.IsNotExist(err) && d.force { log.Warningf("couldn't find container %q: %v", id, err) diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 640de4c47..8a8d9f752 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -81,7 +81,7 @@ func (c *Do) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { if len(f.Args()) == 0 { - c.Usage() + f.Usage() return subcommands.ExitUsageError } diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index 25fe2cf1c..3836b7b4e 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -74,7 +74,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.Load(conf.RootDir, id) + c, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading sandbox: %v", err) } @@ -85,7 +85,12 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa ev, err := c.Event() if err != nil { log.Warningf("Error getting events for container: %v", err) + if evs.stats { + return subcommands.ExitFailure + } } + log.Debugf("Events: %+v", ev) + // err must be preserved because it is used below when breaking // out of the loop. b, err := json.Marshal(ev) @@ -101,11 +106,9 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa if err != nil { return subcommands.ExitFailure } - break + return subcommands.ExitSuccess } time.Sleep(time.Duration(evs.intervalSec) * time.Second) } - - return subcommands.ExitSuccess } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 775ed4b43..eafd6285c 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -112,7 +112,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } waitStatus := args[1].(*syscall.WaitStatus) - c, err := container.Load(conf.RootDir, id) + c, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading sandbox: %v", err) } @@ -150,7 +150,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus { - // Start the new process and get it pid. + // Start the new process and get its pid. pid, err := c.Execute(e) if err != nil { return Errorf("executing processes for container: %v", err) diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 04eee99b2..fe69e2a08 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -69,7 +69,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("it is invalid to specify both --all and --pid") } - c, err := container.Load(conf.RootDir, id) + c, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index f92d6fef9..6907eb16a 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -79,7 +79,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Collect the containers. 
var containers []*container.Container for _, id := range ids { - c, err := container.Load(conf.RootDir, id) + c, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading container %q: %v", id, err) } diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index 0eb1402ed..fe7d4e257 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -55,7 +55,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s id := f.Arg(0) conf := args[0].(*config.Config) - cont, err := container.Load(conf.RootDir, id) + cont, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index bc58c928f..18d7a1436 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -60,7 +60,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.Load(conf.RootDir, id) + c, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading sandbox: %v", err) } diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index f24823f99..a00928204 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -56,7 +56,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} id := f.Arg(0) conf := args[0].(*config.Config) - cont, err := container.Load(conf.RootDir, id) + cont, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 88991b521..f6499cc44 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" ) // Start implements subcommands.Command for the "start" command. @@ -54,10 +55,16 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.Load(conf.RootDir, id) + c, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading container: %v", err) } + // Read the spec again here to ensure flag annotations from the spec are + // applied to "conf". + if _, err := specutils.ReadSpec(c.BundleDir, conf); err != nil { + Fatalf("reading spec: %v", err) + } + if err := c.Start(conf); err != nil { Fatalf("starting container: %v", err) } diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index 2bd2ab9f8..d8a70dd7f 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -57,7 +57,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.Load(conf.RootDir, id) + c, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/symbolize.go b/runsc/cmd/symbolize.go new file mode 100644 index 000000000..fc0c69358 --- /dev/null +++ b/runsc/cmd/symbolize.go @@ -0,0 +1,91 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "bufio" + "context" + "os" + "strconv" + "strings" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/runsc/flag" +) + +// Symbolize implements subcommands.Command for the "symbolize" command. +type Symbolize struct { + dumpAll bool +} + +// Name implements subcommands.Command.Name. +func (*Symbolize) Name() string { + return "symbolize" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Symbolize) Synopsis() string { + return "Convert synthetic instruction pointers from kcov into positions in the runsc source code. Only used when Go coverage is enabled." +} + +// Usage implements subcommands.Command.Usage. +func (*Symbolize) Usage() string { + return `symbolize - converts synthetic instruction pointers into positions in the runsc source code. + +This command takes instruction pointers from stdin and converts them into their +corresponding file names and line/column numbers in the runsc source code. The +inputs are not interpreted as actual addresses, but as synthetic values that are +exposed through /sys/kernel/debug/kcov. One can extract coverage information +from kcov and translate those values into locations in the source code by +running symbolize on the same runsc binary. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Symbolize) SetFlags(f *flag.FlagSet) { + f.BoolVar(&c.dumpAll, "all", false, "dump information on all coverage blocks along with their synthetic PCs") +} + +// Execute implements subcommands.Command.Execute. +func (c *Symbolize) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 0 { + f.Usage() + return subcommands.ExitUsageError + } + if !coverage.KcovAvailable() { + return Errorf("symbolize can only be used when coverage is available.") + } + coverage.InitCoverageData() + + if c.dumpAll { + coverage.WriteAllBlocks(os.Stdout) + return subcommands.ExitSuccess + } + + scanner := bufio.NewScanner(os.Stdin) + for scanner.Scan() { + // Input is always base 16, but may or may not have a leading "0x". 
+ str := strings.TrimPrefix(scanner.Text(), "0x") + pc, err := strconv.ParseUint(str, 16 /* base */, 64 /* bitSize */) + if err != nil { + return Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err) + } + if err := coverage.Symbolize(os.Stdout, pc); err != nil { + return Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err) + } + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index 28d0642ed..c1d6aeae2 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -72,7 +72,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.Load(conf.RootDir, id) + c, err := container.LoadAndCheck(conf.RootDir, id) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/config/config.go b/runsc/config/config.go index f30f79f68..b02d8e2e1 100644 --- a/runsc/config/config.go +++ b/runsc/config/config.go @@ -37,6 +37,9 @@ type Config struct { // RootDir is the runtime root directory. RootDir string `flag:"root"` + // Traceback changes the Go runtime's traceback level. + Traceback string `flag:"traceback"` + // Debug indicates that debug logging should be enabled. Debug bool `flag:"debug"` diff --git a/runsc/config/flags.go b/runsc/config/flags.go index a5f25cfa2..02ab9255a 100644 --- a/runsc/config/flags.go +++ b/runsc/config/flags.go @@ -29,7 +29,7 @@ import ( var registration sync.Once -// This is the set of flags used to populate Config. +// RegisterFlags registers flags used to populate Config. func RegisterFlags() { registration.Do(func() { // Although these flags are not part of the OCI spec, they are used by @@ -49,6 +49,7 @@ func RegisterFlags() { flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.") flag.Bool("alsologtostderr", false, "send log messages to stderr.") flag.Bool("allow-flag-override", false, "allow OCI annotations (dev.gvisor.flag.<name>) to override flags for debugging.") + flag.String("traceback", "system", "golang runtime's traceback level") // Debugging flags: strace related flag.Bool("strace", false, "enable strace.") @@ -70,7 +71,7 @@ func RegisterFlags() { flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem") flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") - flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") + flag.Bool("vfs2", false, "enables VFSv2. This uses the new VFS layer that is faster than the previous one.") flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") // Flags that control sandbox runtime behavior: network related. diff --git a/runsc/console/console.go b/runsc/console/console.go index dbb88e117..b36028792 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -24,8 +24,8 @@ import ( "golang.org/x/sys/unix" ) -// NewWithSocket creates pty master/replica pair, sends the master FD over the given -// socket, and returns the replica. +// NewWithSocket creates pty master/replica pair, sends the master FD over the +// given socket, and returns the replica. func NewWithSocket(socketPath string) (*os.File, error) { // Create a new pty master and replica. 
	ptyMaster, ptyReplica, err := pty.Open()
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index c33755482..8793c8916 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "more_shards")

package(licenses = ["notice"])

@@ -24,6 +24,7 @@ go_library(
        "//runsc/boot",
        "//runsc/cgroup",
        "//runsc/config",
+       "//runsc/console",
        "//runsc/sandbox",
        "//runsc/specutils",
        "@com_github_cenkalti_backoff//:go_default_library",
@@ -48,7 +49,7 @@ go_test(
        "//test/cmd/test_app",
    ],
    library = ":container",
-    shard_count = 10,
+    shard_count = more_shards,
    tags = [
        "requires-kvm",
    ],
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 4228399b8..1b0fdebd6 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -18,6 +18,7 @@ import (
 	"bytes"
 	"fmt"
 	"io"
+	"math/rand"
 	"os"
 	"path/filepath"
 	"syscall"
@@ -27,7 +28,6 @@ import (
 	"github.com/kr/pty"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/sentry/control"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/pkg/unet"
@@ -38,19 +38,22 @@ import (
 // path is under 108 characters (the unix socket path length limit),
 // relativizing the path if necessary.
 func socketPath(bundleDir string) (string, error) {
-	path := filepath.Join(bundleDir, "socket")
+	num := rand.Intn(10000)
+	path := filepath.Join(bundleDir, fmt.Sprintf("socket-%04d", num))
+	const maxPathLen = 108
+	if len(path) <= maxPathLen {
+		return path, nil
+	}
+
+	// Path is too large, try to make it smaller.
 	cwd, err := os.Getwd()
 	if err != nil {
 		return "", fmt.Errorf("error getting cwd: %v", err)
 	}
-	relPath, err := filepath.Rel(cwd, path)
+	path, err = filepath.Rel(cwd, path)
 	if err != nil {
 		return "", fmt.Errorf("error getting relative path for %q from cwd %q: %v", path, cwd, err)
 	}
-	if len(path) > len(relPath) {
-		path = relPath
-	}
-	const maxPathLen = 108
 	if len(path) > maxPathLen {
 		return "", fmt.Errorf("could not get socket path under length limit %d: %s", maxPathLen, path)
 	}
@@ -159,6 +162,82 @@ func TestConsoleSocket(t *testing.T) {
 	}
 }

+// Test that a pty FD is sent over the console socket if one is provided.
+func TestMultiContainerConsoleSocket(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			tru := []string{"true"}
+			testSpecs, ids := createSpecs(sleep, tru)
+			testSpecs[1].Process.Terminal = true
+
+			bundleDir, cleanup, err := testutil.SetupBundleDir(testSpecs[0])
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			args := Args{
+				ID:        ids[0],
+				Spec:      testSpecs[0],
+				BundleDir: bundleDir,
+			}
+			rootCont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer rootCont.Destroy()
+			if err := rootCont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			bundleDir, cleanup, err = testutil.SetupBundleDir(testSpecs[1])
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			sock, err := socketPath(bundleDir)
+			if err != nil {
+				t.Fatalf("error getting socket path: %v", err)
+			}
+			srv, cleanup := createConsoleSocket(t, sock)
+			defer cleanup()
+
+			// Create the container and pass the socket name.
+			args = Args{
+				ID:            ids[1],
+				Spec:          testSpecs[1],
+				BundleDir:     bundleDir,
+				ConsoleSocket: sock,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Make sure we get a console PTY.
+			ptyMaster, err := receiveConsolePTY(srv)
+			if err != nil {
+				t.Fatalf("error receiving console FD: %v", err)
+			}
+			ptyMaster.Close()
+		})
+	}
+}
+
 // Test that job control signals work on a console created with "exec -ti".
 func TestJobControlSignalExec(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
@@ -221,9 +300,9 @@ func TestJobControlSignalExec(t *testing.T) {
 	// Make sure all the processes are running.
 	expectedPL := []*control.Process{
 		// Root container process.
-		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+		newProcessBuilder().Cmd("sleep").Process(),
 		// Bash from exec process.
-		{PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
+		newProcessBuilder().PID(2).Cmd("bash").Process(),
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
@@ -233,7 +312,7 @@ func TestJobControlSignalExec(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))

 	// Wait for it to start. Sleep's PPID is bash's PID.
-	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
+	expectedPL = append(expectedPL, newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process())
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
 	}
@@ -254,7 +333,7 @@
 	// Sleep is dead, but it may take more time for bash to notice and
 	// change the foreground process back to itself. We know it is done
 	// when bash writes "Terminated" to the pty.
-	if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil {
+	if err := testutil.WaitUntilRead(ptyMaster, "Terminated", 5*time.Second); err != nil {
 		t.Fatalf("bash did not take over pty: %v", err)
 	}
@@ -359,7 +438,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	// Wait for bash to start.
	expectedPL := []*control.Process{
-		{PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
+		newProcessBuilder().PID(1).Cmd("bash").Process(),
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatalf("error waiting for processes: %v", err)
@@ -369,7 +448,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))

 	// Wait for sleep to start.
-	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
+	expectedPL = append(expectedPL, newProcessBuilder().PID(2).PPID(1).Cmd("sleep").Process())
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatalf("error waiting for processes: %v", err)
 	}
@@ -393,7 +472,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	// Sleep is dead, but it may take more time for bash to notice and
 	// change the foreground process back to itself. We know it is done
 	// when bash writes "Terminated" to the pty.
-	if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil {
+	if err := testutil.WaitUntilRead(ptyBuf, "Terminated", 5*time.Second); err != nil {
 		t.Fatalf("bash did not take over pty: %v", err)
 	}
@@ -414,6 +493,104 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	}
 }

+// Test that terminal works with root and sub-containers.
+func TestMultiContainerTerminal(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Don't let bash execute from profile or rc files, otherwise our PID
+			// counts get messed up.
+			bash := []string{"/bin/bash", "--noprofile", "--norc"}
+			testSpecs, ids := createSpecs(bash, bash)
+
+			type termContainer struct {
+				container *Container
+				master    *os.File
+			}
+			var containers []termContainer
+			for i, spec := range testSpecs {
+				bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
+				if err != nil {
+					t.Fatalf("error setting up container: %v", err)
+				}
+				defer cleanup()
+
+				spec.Process.Terminal = true
+				sock, err := socketPath(bundleDir)
+				if err != nil {
+					t.Fatalf("error getting socket path: %v", err)
+				}
+				srv, cleanup := createConsoleSocket(t, sock)
+				defer cleanup()
+
+				// Create the container and pass the socket name.
+				args := Args{
+					ID:            ids[i],
+					Spec:          spec,
+					BundleDir:     bundleDir,
+					ConsoleSocket: sock,
+				}
+				cont, err := New(conf, args)
+				if err != nil {
+					t.Fatalf("error creating container: %v", err)
+				}
+				defer cont.Destroy()
+
+				if err := cont.Start(conf); err != nil {
+					t.Fatalf("error starting container: %v", err)
+				}
+
+				// Make sure we get a console PTY.
+				ptyMaster, err := receiveConsolePTY(srv)
+				if err != nil {
+					t.Fatalf("error receiving console FD: %v", err)
+				}
+				defer ptyMaster.Close()
+
+				containers = append(containers, termContainer{
+					container: cont,
+					master:    ptyMaster,
+				})
+			}
+
+			for _, tc := range containers {
+				// Bash output as well as sandbox output will be written to the PTY
+				// file. Writes after a certain point will block unless we drain the
+				// PTY, so we must continually copy from it.
+				//
+				// We log the output to stderr for debuggability, and also to a buffer,
+				// since we wait on particular output from bash below. We use a custom
+				// blockingBuffer which is thread-safe and also blocks on Read calls,
+				// which makes this a suitable Reader for WaitUntilRead.
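The blockingBuffer type referenced here is declared further down in console_test.go and is unchanged by this diff. The idea can be sketched as follows (illustrative only, assuming a sync.Cond; not the actual implementation):

package sketch

import (
	"bytes"
	"sync"
)

// blockingBuf is a sketch of a thread-safe buffer whose Reads block
// while the buffer is empty, which makes it usable as the Reader
// passed to a polling helper like WaitUntilRead.
type blockingBuf struct {
	mu   sync.Mutex
	cond *sync.Cond
	buf  bytes.Buffer
}

func newBlockingBuf() *blockingBuf {
	b := &blockingBuf{}
	b.cond = sync.NewCond(&b.mu)
	return b
}

func (b *blockingBuf) Write(p []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()
	n, err := b.buf.Write(p)
	b.cond.Broadcast() // wake any blocked readers
	return n, err
}

func (b *blockingBuf) Read(p []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()
	for b.buf.Len() == 0 {
		b.cond.Wait() // block until a Write arrives
	}
	return b.buf.Read(p)
}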
+				ptyBuf := newBlockingBuffer()
+				tee := io.TeeReader(tc.master, ptyBuf)
+				go io.Copy(os.Stderr, tee)
+
+				// Wait for bash to start.
+				expectedPL := []*control.Process{
+					newProcessBuilder().Cmd("bash").Process(),
+				}
+				if err := waitForProcessList(tc.container, expectedPL); err != nil {
+					t.Fatalf("error waiting for processes: %v", err)
+				}
+
+				// Execute echo command and check that it was executed correctly. Use
+				// a variable so the match is not against the echoed command itself.
+				tc.master.Write([]byte("echo foo-${PWD}-123\n"))
+				if err := testutil.WaitUntilRead(ptyBuf, "foo-/-123", 5*time.Second); err != nil {
+					t.Fatalf("echo didn't execute: %v", err)
+				}
+			}
+		})
+	}
+}
+
 // blockingBuffer is a thread-safe buffer that blocks when reading if the
 // buffer is empty. It implements io.ReadWriter.
 type blockingBuffer struct {
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 63f64ce6e..418a27beb 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -38,6 +38,7 @@ import (
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/cgroup"
 	"gvisor.dev/gvisor/runsc/config"
+	"gvisor.dev/gvisor/runsc/console"
 	"gvisor.dev/gvisor/runsc/sandbox"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -79,6 +80,7 @@ func validateID(id string) error {
 // - It calls 'runsc delete'. runc implementation kills --all SIGKILL once
 // again just to be sure, waits, and then proceeds with remaining teardown.
 //
+// Container is thread-unsafe.
 type Container struct {
 	// ID is the container ID.
 	ID string `json:"id"`
@@ -159,9 +161,9 @@ func loadSandbox(rootDir, id string) ([]*Container, error) {
 // container to which id unambiguously refers to. Returns ErrNotExist if
 // container doesn't exist.
 func Load(rootDir, partialID string) (*Container, error) {
-	log.Debugf("Load container %q %q", rootDir, partialID)
+	log.Debugf("Load container, rootDir: %q, partial cid: %s", rootDir, partialID)
 	if err := validateID(partialID); err != nil {
-		return nil, fmt.Errorf("validating id: %v", err)
+		return nil, fmt.Errorf("invalid container id: %v", err)
 	}

 	id, err := findContainerID(rootDir, partialID)
@@ -184,22 +186,31 @@ func Load(rootDir, partialID string) (*Container, error) {
 		}
 		return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
 	}
+	return c, nil
+}
+
+// LoadAndCheck is similar to Load(), but also checks if the container is still
+// running, so that the caller gets an error earlier.
+func LoadAndCheck(rootDir, partialID string) (*Container, error) {
+	c, err := Load(rootDir, partialID)
+	if err != nil {
+		// Preserve error so that callers can distinguish 'not found' errors.
+		return nil, err
+	}

-	// If the status is "Running" or "Created", check that the sandbox
-	// process still exists, and set it to Stopped if it does not.
+	// If the status is "Running" or "Created", check that the sandbox/container
+	// is still running, setting it to Stopped if not.
 	//
 	// This is inherently racy.
-	if c.Status == Running || c.Status == Created {
-		// Check if the sandbox process is still running.
+	switch c.Status {
+	case Created:
 		if !c.isSandboxRunning() {
 			// Sandbox no longer exists, so this container definitely does not exist.
 			c.changeStatus(Stopped)
-		} else if c.Status == Running {
-			// Container state should reflect the actual state of the application, so
-			// we don't consider gofer process here.
-			if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
-				c.changeStatus(Stopped)
-			}
+		}
+	case Running:
+		if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
+			c.changeStatus(Stopped)
 		}
 	}
@@ -271,7 +282,7 @@ type Args struct {
 // indicates that an existing Sandbox should be used. The caller must call
 // Destroy() on the container.
 func New(conf *config.Config, args Args) (*Container, error) {
-	log.Debugf("Create container %q in root dir: %s", args.ID, conf.RootDir)
+	log.Debugf("Create container, cid: %s, rootDir: %q", args.ID, conf.RootDir)
 	if err := validateID(args.ID); err != nil {
 		return nil, err
 	}
@@ -310,7 +321,15 @@ func New(conf *config.Config, args Args) (*Container, error) {
 	// indicate the ID of the sandbox, which is the same as the ID of the
 	// init container in the sandbox.
 	if isRoot(args.Spec) {
-		log.Debugf("Creating new sandbox for container %q", args.ID)
+		log.Debugf("Creating new sandbox for container, cid: %s", args.ID)
+
+		if args.Spec.Linux == nil {
+			args.Spec.Linux = &specs.Linux{}
+		}
+		// Don't force the use of cgroups in tests because they lack permission to do so.
+		if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+			args.Spec.Linux.CgroupsPath = "/" + args.ID
+		}

 		// Create and join cgroup before processes are created to ensure they are
 		// part of the cgroup from the start (and all their children processes).
@@ -321,7 +340,13 @@ func New(conf *config.Config, args Args) (*Container, error) {
 		if cg != nil {
 			// If there is cgroup config, install it before creating sandbox process.
 			if err := cg.Install(args.Spec.Linux.Resources); err != nil {
-				return nil, fmt.Errorf("configuring cgroup: %v", err)
+				switch {
+				case errors.Is(err, syscall.EACCES) && conf.Rootless:
+					log.Warningf("Skipping cgroup configuration in rootless mode: %v", err)
+					cg = nil
+				default:
+					return nil, fmt.Errorf("configuring cgroup: %v", err)
+				}
 			}
 		}
 		if err := runInCgroup(cg, func() error {
@@ -366,15 +391,30 @@ func New(conf *config.Config, args Args) (*Container, error) {
 		if !ok {
 			return nil, fmt.Errorf("no sandbox ID found when creating container")
 		}
-		log.Debugf("Creating new container %q in sandbox %q", c.ID, sbid)
+		log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sbid)

 		// Find the sandbox associated with this ID.
-		sb, err := Load(conf.RootDir, sbid)
+		sb, err := LoadAndCheck(conf.RootDir, sbid)
 		if err != nil {
 			return nil, err
 		}
 		c.Sandbox = sb.Sandbox
-		if err := c.Sandbox.CreateContainer(c.ID); err != nil {
+
+		// If the console control socket file is provided, then create a new
+		// pty master/replica pair and send the TTY to the sandbox process.
+		var tty *os.File
+		if c.ConsoleSocket != "" {
+			// Create a new TTY pair and send the master on the provided socket.
+			var err error
+			tty, err = console.NewWithSocket(c.ConsoleSocket)
+			if err != nil {
+				return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err)
+			}
+			// The TTY file is transferred to the sandbox, so it can be closed here.
+			defer tty.Close()
+		}
+
+		if err := c.Sandbox.CreateContainer(c.ID, tty); err != nil {
 			return nil, err
 		}
 	}
@@ -399,7 +439,7 @@ func New(conf *config.Config, args Args) (*Container, error) {

 // Start starts running the containerized process inside the sandbox.
func (c *Container) Start(conf *config.Config) error { - log.Debugf("Start container %q", c.ID) + log.Debugf("Start container, cid: %s", c.ID) if err := c.Saver.lock(); err != nil { return err @@ -428,11 +468,16 @@ func (c *Container) Start(conf *config.Config) error { // the start (and all their children processes). if err := runInCgroup(c.Sandbox.Cgroup, func() error { // Create the gofer process. - ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) + goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) if err != nil { return err } - defer mountsFile.Close() + defer func() { + _ = mountsFile.Close() + for _, f := range goferFiles { + _ = f.Close() + } + }() cleanMounts, err := specutils.ReadMounts(mountsFile) if err != nil { @@ -440,7 +485,14 @@ func (c *Container) Start(conf *config.Config) error { } c.Spec.Mounts = cleanMounts - return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles) + // Setup stdios if the container is not using terminal. Otherwise TTY was + // already setup in create. + var stdios []*os.File + if !c.Spec.Process.Terminal { + stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr} + } + + return c.Sandbox.StartContainer(c.Spec, conf, c.ID, stdios, goferFiles) }); err != nil { return err } @@ -462,7 +514,7 @@ func (c *Container) Start(conf *config.Config) error { unlock.Clean() // Adjust the oom_score_adj for sandbox. This must be done after saveLocked(). - if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Saver.RootDir, false); err != nil { + if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Spec, c.Saver.RootDir, false); err != nil { return err } @@ -474,7 +526,7 @@ func (c *Container) Start(conf *config.Config) error { // Restore takes a container and replaces its kernel and file system // to restore a container from its state file. func (c *Container) Restore(spec *specs.Spec, conf *config.Config, restoreFile string) error { - log.Debugf("Restore container %q", c.ID) + log.Debugf("Restore container, cid: %s", c.ID) if err := c.Saver.lock(); err != nil { return err } @@ -501,7 +553,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *config.Config, restoreFile s // Run is a helper that calls Create + Start + Wait. func Run(conf *config.Config, args Args) (syscall.WaitStatus, error) { - log.Debugf("Run container %q in root dir: %s", args.ID, conf.RootDir) + log.Debugf("Run container, cid: %s, rootDir: %q", args.ID, conf.RootDir) c, err := New(conf, args) if err != nil { return 0, fmt.Errorf("creating container: %v", err) @@ -533,7 +585,7 @@ func Run(conf *config.Config, args Args) (syscall.WaitStatus, error) { // Execute runs the specified command in the container. It returns the PID of // the newly created process. func (c *Container) Execute(args *control.ExecArgs) (int32, error) { - log.Debugf("Execute in container %q, args: %+v", c.ID, args) + log.Debugf("Execute in container, cid: %s, args: %+v", c.ID, args) if err := c.requireStatus("execute in", Created, Running); err != nil { return 0, err } @@ -543,7 +595,7 @@ func (c *Container) Execute(args *control.ExecArgs) (int32, error) { // Event returns events for the container. 
func (c *Container) Event() (*boot.Event, error) { - log.Debugf("Getting events for container %q", c.ID) + log.Debugf("Getting events for container, cid: %s", c.ID) if err := c.requireStatus("get events for", Created, Running, Paused); err != nil { return nil, err } @@ -563,14 +615,19 @@ func (c *Container) SandboxPid() int { // Call to wait on a stopped container is needed to retrieve the exit status // and wait returns immediately. func (c *Container) Wait() (syscall.WaitStatus, error) { - log.Debugf("Wait on container %q", c.ID) - return c.Sandbox.Wait(c.ID) + log.Debugf("Wait on container, cid: %s", c.ID) + ws, err := c.Sandbox.Wait(c.ID) + if err == nil { + // Wait succeeded, container is not running anymore. + c.changeStatus(Stopped) + } + return ws, err } // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and // returns its WaitStatus. func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { - log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID) + log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } @@ -580,7 +637,7 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { // WaitPID waits for process 'pid' in the container's PID namespace and returns // its WaitStatus. func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { - log.Debugf("Wait on PID %d in container %q", pid, c.ID) + log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } @@ -592,7 +649,7 @@ func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { // SignalContainer returns an error if the container is already stopped. // TODO(b/113680494): Distinguish different error types. func (c *Container) SignalContainer(sig syscall.Signal, all bool) error { - log.Debugf("Signal container %q: %v", c.ID, sig) + log.Debugf("Signal container, cid: %s, signal: %v (%d)", c.ID, sig, sig) // Signaling container in Stopped state is allowed. When all=false, // an error will be returned anyway; when all=true, this allows // sending signal to other processes inside the container even @@ -609,7 +666,7 @@ func (c *Container) SignalContainer(sig syscall.Signal, all bool) error { // SignalProcess sends sig to a specific process in the container. func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error { - log.Debugf("Signal process %d in container %q: %v", pid, c.ID, sig) + log.Debugf("Signal process %d in container, cid: %s, signal: %v (%d)", pid, c.ID, sig, sig) if err := c.requireStatus("signal a process inside", Running); err != nil { return err } @@ -623,15 +680,15 @@ func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error { // container process inside the sandbox. It returns a function that will stop // forwarding signals. 
func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
-	log.Debugf("Forwarding all signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
+	log.Debugf("Forwarding all signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
 	stop := sighandling.StartSignalForwarding(func(sig linux.Signal) {
-		log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", sig, c.ID, pid, fgProcess)
+		log.Debugf("Forwarding signal %d to container, cid: %s, PID: %d, fgProcess: %t", sig, c.ID, pid, fgProcess)
 		if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.Signal(sig), fgProcess); err != nil {
 			log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err)
 		}
 	})
 	return func() {
-		log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
+		log.Debugf("Done forwarding signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
 		stop()
 	}
 }
@@ -639,7 +696,7 @@ func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
 // Checkpoint sends the checkpoint call to the container.
 // The statefile will be written to f, the file at the specified image-path.
 func (c *Container) Checkpoint(f *os.File) error {
-	log.Debugf("Checkpoint container %q", c.ID)
+	log.Debugf("Checkpoint container, cid: %s", c.ID)
 	if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil {
 		return err
 	}
@@ -649,7 +706,7 @@
 // Pause suspends the container and its kernel.
 // The call only succeeds if the container's status is created or running.
 func (c *Container) Pause() error {
-	log.Debugf("Pausing container %q", c.ID)
+	log.Debugf("Pausing container, cid: %s", c.ID)
 	if err := c.Saver.lock(); err != nil {
 		return err
 	}
@@ -660,7 +717,7 @@
 	}

 	if err := c.Sandbox.Pause(c.ID); err != nil {
-		return fmt.Errorf("pausing container: %v", err)
+		return fmt.Errorf("pausing container %q: %v", c.ID, err)
 	}
 	c.changeStatus(Paused)
 	return c.saveLocked()
@@ -669,7 +726,7 @@
 // Resume unpauses the container and its kernel.
 // The call only succeeds if the container's status is paused.
 func (c *Container) Resume() error {
-	log.Debugf("Resuming container %q", c.ID)
+	log.Debugf("Resuming container, cid: %s", c.ID)
 	if err := c.Saver.lock(); err != nil {
 		return err
 	}
@@ -708,7 +765,7 @@
 // Destroy stops all processes and frees all resources associated with the
 // container.
 func (c *Container) Destroy() error {
-	log.Debugf("Destroy container %q", c.ID)
+	log.Debugf("Destroy container, cid: %s", c.ID)

 	if err := c.Saver.lock(); err != nil {
 		return err
@@ -745,14 +802,12 @@
 	c.changeStatus(Stopped)

 	// Adjust oom_score_adj for the sandbox. This must be done after the container
-	// is stopped and the directory at c.Root is removed. Adjustment can be
-	// skipped if the root container is exiting, because it brings down the entire
-	// sandbox.
+	// is stopped and the directory at c.Root is removed.
 	//
 	// Use 'sb' to tell whether it has been executed before because Destroy must
 	// be idempotent.
- if sb != nil && !isRoot(c.Spec) { - if err := adjustSandboxOOMScoreAdj(sb, c.Saver.RootDir, true); err != nil { + if sb != nil { + if err := adjustSandboxOOMScoreAdj(sb, c.Spec, c.Saver.RootDir, true); err != nil { errs = append(errs, err.Error()) } } @@ -781,7 +836,7 @@ func (c *Container) Destroy() error { // // Precondition: container must be locked with container.lock(). func (c *Container) saveLocked() error { - log.Debugf("Save container %q", c.ID) + log.Debugf("Save container, cid: %s", c.ID) if err := c.Saver.saveLocked(c); err != nil { return fmt.Errorf("saving container metadata: %v", err) } @@ -795,7 +850,7 @@ func (c *Container) stop() error { var cgroup *cgroup.Cgroup if c.Sandbox != nil { - log.Debugf("Destroying container %q", c.ID) + log.Debugf("Destroying container, cid: %s", c.ID) if err := c.Sandbox.DestroyContainer(c.ID); err != nil { return fmt.Errorf("destroying container %q: %v", c.ID, err) } @@ -809,7 +864,7 @@ func (c *Container) stop() error { // Try killing gofer if it does not exit with container. if c.GoferPid != 0 { - log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) + log.Debugf("Killing gofer for container, cid: %s, PID: %d", c.ID, c.GoferPid) if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { // The gofer may already be stopped, log the error. log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, c.GoferPid, err) @@ -1082,7 +1137,13 @@ func (c *Container) adjustGoferOOMScoreAdj() error { // TODO(gvisor.dev/issue/238): This call could race with other containers being // created at the same time and end up setting the wrong oom_score_adj to the // sandbox. Use rpc client to synchronize. -func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) error { +func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, spec *specs.Spec, rootDir string, destroy bool) error { + // Adjustment can be skipped if the root container is exiting, because it + // brings down the entire sandbox. + if isRoot(spec) && destroy { + return nil + } + containers, err := loadSandbox(rootDir, s.ID) if err != nil { return fmt.Errorf("loading sandbox containers: %v", err) @@ -1096,53 +1157,34 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) // Get the lowest score for all containers. var lowScore int scoreFound := false - if len(containers) == 1 && specutils.SpecContainerType(containers[0].Spec) == specutils.ContainerTypeUnspecified { - // This is a single-container sandbox. Set the oom_score_adj to - // the value specified in the OCI bundle. - if containers[0].Spec.Process.OOMScoreAdj != nil { - scoreFound = true - lowScore = *containers[0].Spec.Process.OOMScoreAdj + for _, container := range containers { + // Special multi-container support for CRI. Ignore the root container when + // calculating oom_score_adj for the sandbox because it is the + // infrastructure (pause) container and always has a very low oom_score_adj. + // + // We will use OOMScoreAdj in the single-container case where the + // containerd container-type annotation is not present. + if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox { + continue } - } else { - for _, container := range containers { - // Special multi-container support for CRI. Ignore the root - // container when calculating oom_score_adj for the sandbox because - // it is the infrastructure (pause) container and always has a very - // low oom_score_adj. 
- // - // We will use OOMScoreAdj in the single-container case where the - // containerd container-type annotation is not present. - if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox { - continue - } - if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) { - scoreFound = true - lowScore = *container.Spec.Process.OOMScoreAdj - } + if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) { + scoreFound = true + lowScore = *container.Spec.Process.OOMScoreAdj } } // If the container is destroyed and remaining containers have no - // oomScoreAdj specified then we must revert to the oom_score_adj of the - // parent process. + // oomScoreAdj specified then we must revert to the original oom_score_adj + // saved with the root container. if !scoreFound && destroy { - ppid, err := specutils.GetParentPid(s.Pid) - if err != nil { - return fmt.Errorf("getting parent pid of sandbox pid %d: %v", s.Pid, err) - } - pScore, err := specutils.GetOOMScoreAdj(ppid) - if err != nil { - return fmt.Errorf("getting oom_score_adj of parent %d: %v", ppid, err) - } - + lowScore = containers[0].Sandbox.OriginalOOMScoreAdj scoreFound = true - lowScore = pScore } - // Only set oom_score_adj if one of the containers has oom_score_adj set - // in the OCI bundle. If not, we need to inherit the parent process's - // oom_score_adj. + // Only set oom_score_adj if one of the containers has oom_score_adj set. If + // not, oom_score_adj is inherited from the parent process. + // // See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process if !scoreFound { return nil diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index cc188f45b..fa99e403a 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -364,7 +364,7 @@ func TestLifecycle(t *testing.T) { defer c.Destroy() // Load the container from disk and check the status. - c, err = Load(rootDir, args.ID) + c, err = LoadAndCheck(rootDir, args.ID) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -387,7 +387,7 @@ func TestLifecycle(t *testing.T) { } // Load the container from disk and check the status. - c, err = Load(rootDir, args.ID) + c, err = LoadAndCheck(rootDir, args.ID) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -428,7 +428,7 @@ func TestLifecycle(t *testing.T) { } // Load the container from disk and check the status. - c, err = Load(rootDir, args.ID) + c, err = LoadAndCheck(rootDir, args.ID) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -451,7 +451,7 @@ func TestLifecycle(t *testing.T) { } // Loading the container by id should fail. 
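The Load-to-LoadAndCheck migration in these call sites relies on the classic signal-0 liveness probe that SignalContainer(syscall.Signal(0), false) performs. A standalone sketch of the underlying idiom:

package sketch

import "syscall"

// pidAlive reports whether a process can currently be signaled.
// Signal 0 performs the existence and permission checks without
// delivering any signal. A fuller implementation might treat EPERM
// (process exists but belongs to someone else) as "alive".
func pidAlive(pid int) bool {
	return syscall.Kill(pid, syscall.Signal(0)) == nil
}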
- if _, err = Load(rootDir, args.ID); err == nil { + if _, err = LoadAndCheck(rootDir, args.ID); err == nil { t.Errorf("expected loading destroyed container to fail, but it did not") } }) @@ -1738,7 +1738,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) { cids[2]: cids[2], } for shortid, longid := range unambiguous { - if _, err := Load(rootDir, shortid); err != nil { + if _, err := LoadAndCheck(rootDir, shortid); err != nil { t.Errorf("%q should resolve to %q: %v", shortid, longid, err) } } @@ -1749,7 +1749,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) { "ba", } for _, shortid := range ambiguous { - if s, err := Load(rootDir, shortid); err == nil { + if s, err := LoadAndCheck(rootDir, shortid); err == nil { t.Errorf("%q should be ambiguous, but resolved to %q", shortid, s.ID) } } @@ -1976,11 +1976,11 @@ func doDestroyNotStartedTest(t *testing.T, vfs2 bool) { // TestDestroyStarting attempts to force a race between start and destroy. func TestDestroyStarting(t *testing.T) { - doDestroyNotStartedTest(t, false) + doDestroyStartingTest(t, false) } func TestDestroyStartedVFS2(t *testing.T) { - doDestroyNotStartedTest(t, true) + doDestroyStartingTest(t, true) } func doDestroyStartingTest(t *testing.T, vfs2 bool) { @@ -2007,7 +2007,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) { // Container is not thread safe, so load another instance to run in // concurrently. - startCont, err := Load(rootDir, args.ID) + startCont, err := LoadAndCheck(rootDir, args.ID) if err != nil { t.Fatalf("error loading container: %v", err) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 850e80290..45d4e6e6e 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -15,6 +15,7 @@ package container import ( + "encoding/json" "fmt" "io/ioutil" "math" @@ -300,54 +301,21 @@ func TestMultiContainerWait(t *testing.T) { } defer cleanup() - // Check via ps that multiple processes are running. - expectedPL := []*control.Process{ - newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(), + // Check that we can wait for the sub-container. + c := containers[1] + if ws, err := c.Wait(); err != nil { + t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) } - if err := waitForProcessList(containers[1], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) + if _, err := c.Wait(); err != nil { + t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err) } - // Wait on the short lived container from multiple goroutines. - wg := sync.WaitGroup{} - for i := 0; i < 3; i++ { - wg.Add(1) - go func(c *Container) { - defer wg.Done() - if ws, err := c.Wait(); err != nil { - t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) - } else if es := ws.ExitStatus(); es != 0 { - t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) - } - if _, err := c.Wait(); err != nil { - t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err) - } - }(containers[1]) - } - - // Also wait via PID. 
-	for i := 0; i < 3; i++ {
-		wg.Add(1)
-		go func(c *Container) {
-			defer wg.Done()
-			const pid = 2
-			if ws, err := c.WaitPID(pid); err != nil {
-				t.Errorf("failed to wait for PID %d: %v", pid, err)
-			} else if es := ws.ExitStatus(); es != 0 {
-				t.Errorf("PID %d exited with non-zero status %d", pid, es)
-			}
-			if _, err := c.WaitPID(pid); err == nil {
-				t.Errorf("wait for stopped PID %d should fail", pid)
-			}
-		}(containers[1])
-	}
-
-	wg.Wait()
-
 	// After Wait returns, ensure that the root container is running and
 	// the child has finished.
-	expectedPL = []*control.Process{
-		newProcessBuilder().Cmd("sleep").Process(),
+	expectedPL := []*control.Process{
+		newProcessBuilder().Cmd("sleep").PID(1).Process(),
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -762,7 +730,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 	// processes still running inside.
 	containers[1].SignalContainer(syscall.SIGKILL, false)
 	op := func() error {
-		c, err := Load(conf.RootDir, ids[1])
+		c, err := LoadAndCheck(conf.RootDir, ids[1])
 		if err != nil {
 			return err
 		}
@@ -776,7 +744,7 @@
 		}
 	}

-	c, err := Load(conf.RootDir, ids[1])
+	c, err := LoadAndCheck(conf.RootDir, ids[1])
 	if err != nil {
 		t.Fatalf("failed to load child container %q: %v", c.ID, err)
 	}
@@ -899,7 +867,7 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 	// Container is not thread safe, so load another instance to run in
 	// concurrently.
-	startCont, err := Load(rootDir, ids[i])
+	startCont, err := LoadAndCheck(rootDir, ids[i])
 	if err != nil {
 		t.Fatalf("error loading container: %v", err)
 	}
@@ -1766,3 +1734,72 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
 		})
 	}
 }
+
+func TestMultiContainerEvent(t *testing.T) {
+	conf := testutil.TestConfig(t)
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+	conf.RootDir = rootDir
+
+	// Setup the containers.
+	sleep := []string{"/bin/sleep", "100"}
+	quick := []string{"/bin/true"}
+	podSpec, ids := createSpecs(sleep, sleep, quick)
+	containers, cleanup, err := startContainers(conf, podSpec, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	for _, cont := range containers {
+		t.Logf("Running container %s", cont.ID)
+	}
+
+	// Wait for the last container to exit so that the process count checked
+	// below is stable.
+	if ws, err := containers[2].Wait(); err != nil || ws != 0 {
+		t.Fatalf("Container.Wait, status: %v, err: %v", ws, err)
+	}
+
+	// Check events for running containers.
+	for _, cont := range containers[:2] {
+		evt, err := cont.Event()
+		if err != nil {
+			t.Errorf("Container.Events(): %v", err)
+		}
+		if want := "stats"; evt.Type != want {
+			t.Errorf("Wrong event type, want: %s, got: %s", want, evt.Type)
+		}
+		if cont.ID != evt.ID {
+			t.Errorf("Wrong container ID, want: %s, got: %s", cont.ID, evt.ID)
+		}
+		// Event.Data is an interface, so it comes over the wire as a
+		// map[string]string. Marshal and unmarshal it again into the correct
+		// type.
+		data, err := json.Marshal(evt.Data)
+		if err != nil {
+			t.Fatalf("invalid event data: %v", err)
+		}
+		var stats boot.Stats
+		if err := json.Unmarshal(data, &stats); err != nil {
+			t.Fatalf("invalid event data: %v", err)
+		}
+		// One process per remaining container.
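The marshal/unmarshal round-trip above is the usual way to coerce an untyped urpc payload into a concrete struct. A standalone sketch (the struct here is made up for illustration, not the real boot.Stats layout):

package sketch

import "encoding/json"

// pidStats is a hypothetical typed view of one field in the payload.
type pidStats struct {
	Current uint64 `json:"current"`
}

// toTyped re-encodes an untyped value (e.g. a map decoded from the
// wire) as JSON and decodes it into the typed struct.
func toTyped(data interface{}) (pidStats, error) {
	var out pidStats
	raw, err := json.Marshal(data) // back to JSON bytes
	if err != nil {
		return out, err
	}
	err = json.Unmarshal(raw, &out) // decode into the typed view
	return out, err
}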
+		if want := uint64(2); stats.Pids.Current != want {
+			t.Errorf("Wrong number of PIDs, want: %d, got: %d", want, stats.Pids.Current)
+		}
+	}
+
+	// Check that stopped and destroyed containers return an error.
+	if err := containers[1].Destroy(); err != nil {
+		t.Fatalf("container.Destroy: %v", err)
+	}
+	for _, cont := range containers[1:] {
+		_, err := cont.Event()
+		if err == nil {
+			t.Errorf("Container.Events() should have failed, cid: %s, state: %v", cont.ID, cont.Status)
+		}
+	}
+}
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 8f66dd1f8..9e429f7d5 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -127,7 +127,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 	// Get all interfaces in the namespace.
 	ifaces, err := net.Interfaces()
 	if err != nil {
-		return fmt.Errorf("querying interfaces: %v", err)
+		return fmt.Errorf("querying interfaces: %w", err)
 	}

 	isRoot, err := isRootNS()
@@ -148,14 +148,14 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 		allAddrs, err := iface.Addrs()
 		if err != nil {
-			return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err)
+			return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
 		}

 		// We build our own loopback device.
 		if iface.Flags&net.FlagLoopback != 0 {
 			link, err := loopbackLink(iface, allAddrs)
 			if err != nil {
-				return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err)
+				return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err)
 			}
 			args.LoopbackLinks = append(args.LoopbackLinks, link)
 			continue
@@ -209,7 +209,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 		// Get the link for the interface.
 		ifaceLink, err := netlink.LinkByName(iface.Name)
 		if err != nil {
-			return fmt.Errorf("getting link for interface %q: %v", iface.Name, err)
+			return fmt.Errorf("getting link for interface %q: %w", iface.Name, err)
 		}
 		link.LinkAddress = ifaceLink.Attrs().HardwareAddr
@@ -219,7 +219,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 			log.Debugf("Creating Channel %d", i)
 			socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO)
 			if err != nil {
-				return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
+				return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err)
 			}
 			if i == 0 {
 				link.GSOMaxSize = socketEntry.gsoMaxSize
@@ -241,11 +241,12 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 		// Collect the addresses for the interface, enable forwarding,
 		// and remove them from the host.
 		for _, addr := range ipAddrs {
-			link.Addresses = append(link.Addresses, addr.IP)
+			prefix, _ := addr.Mask.Size()
+			link.Addresses = append(link.Addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix})

 			// Steal IP address from NIC.
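Both network.go hunks now carry the prefix length alongside each address (boot.IPWithPrefix), recovering it from the mask with net.IPMask.Size. For illustration:

package sketch

import (
	"fmt"
	"net"
)

func prefixDemo() {
	// Mask.Size returns the leading-ones count of the mask (the CIDR
	// prefix length) and the total bit length of the mask.
	_, ipNet, err := net.ParseCIDR("192.0.2.5/24")
	if err != nil {
		panic(err)
	}
	ones, bits := ipNet.Mask.Size()
	fmt.Println(ones, bits) // prints: 24 32
}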
if err := removeAddress(ifaceLink, addr.String()); err != nil { - return fmt.Errorf("removing address %v from device %q: %v", iface.Name, addr, err) + return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err) } } @@ -254,7 +255,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG log.Debugf("Setting up network, config: %+v", args) if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { - return fmt.Errorf("creating links and routes: %v", err) + return fmt.Errorf("creating links and routes: %w", err) } return nil } @@ -278,8 +279,6 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) ( ll := syscall.SockaddrLinklayer{ Protocol: protocol, Ifindex: iface.Index, - Hatype: 0, // No ARP type. - Pkttype: syscall.PACKET_OTHERHOST, } if err := syscall.Bind(fd, &ll); err != nil { return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) @@ -339,9 +338,15 @@ func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, err if !ok { return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr) } + + prefix, _ := ipNet.Mask.Size() + link.Addresses = append(link.Addresses, boot.IPWithPrefix{ + Address: ipNet.IP, + PrefixLen: prefix, + }) + dst := *ipNet dst.IP = dst.IP.Mask(dst.Mask) - link.Addresses = append(link.Addresses, ipNet.IP) link.Routes = append(link.Routes, boot.Route{ Destination: dst, }) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index c4309feb3..c84ebcd8a 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -66,6 +66,10 @@ type Sandbox struct { // Cgroup has the cgroup configuration for the sandbox. Cgroup *cgroup.Cgroup `json:"cgroup"` + // OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox + // started, before it may be modified. + OriginalOOMScoreAdj int `json:"originalOomScoreAdj"` + // child is set if a sandbox process is a child of the current process. // // This field isn't saved to json, because only a creator of sandbox @@ -169,7 +173,7 @@ func New(conf *config.Config, args *Args) (*Sandbox, error) { } // CreateContainer creates a non-root container inside the sandbox. -func (s *Sandbox) CreateContainer(cid string) error { +func (s *Sandbox) CreateContainer(cid string, tty *os.File) error { log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) sandboxConn, err := s.sandboxConnect() if err != nil { @@ -177,7 +181,16 @@ func (s *Sandbox) CreateContainer(cid string) error { } defer sandboxConn.Close() - if err := sandboxConn.Call(boot.ContainerCreate, &cid, nil); err != nil { + var files []*os.File + if tty != nil { + files = []*os.File{tty} + } + + args := boot.CreateArgs{ + CID: cid, + FilePayload: urpc.FilePayload{Files: files}, + } + if err := sandboxConn.Call(boot.ContainerCreate, &args, nil); err != nil { return fmt.Errorf("creating non-root container %q: %v", cid, err) } return nil @@ -207,11 +220,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error { } // StartContainer starts running a non-root container inside the sandbox. 
-func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error { - for _, f := range goferFiles { - defer f.Close() - } - +func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error { log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) sandboxConn, err := s.sandboxConnect() if err != nil { @@ -219,15 +228,18 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid stri } defer sandboxConn.Close() - // The payload must container stdin/stdout/stderr followed by gofer - // files. - files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...) + // The payload must contain stdin/stdout/stderr (which may be empty if using + // TTY) followed by gofer files. + payload := urpc.FilePayload{} + payload.Files = append(payload.Files, stdios...) + payload.Files = append(payload.Files, goferFiles...) + // Start running the container. args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, - FilePayload: urpc.FilePayload{Files: files}, + FilePayload: payload, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err) @@ -739,6 +751,11 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn } return err } + s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid) + if err != nil { + return err + } + s.child = true s.Pid = cmd.Process.Pid log.Infof("Sandbox started, PID: %d", s.Pid) @@ -1133,11 +1150,11 @@ func (s *Sandbox) DestroyContainer(cid string) error { func (s *Sandbox) destroyContainer(cid string) error { if s.IsRootContainer(cid) { - log.Debugf("Destroying root container %q by destroying sandbox", cid) + log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid) return s.destroy() } - log.Debugf("Destroying container %q in sandbox %q", cid, s.ID) + log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID) conn, err := s.sandboxConnect() if err != nil { return err diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 0392e3e83..fdbba1832 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -19,6 +19,7 @@ package specutils import ( "encoding/json" "fmt" + "io" "io/ioutil" "os" "path" @@ -169,7 +170,7 @@ func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) { // ReadSpecFromFile reads an OCI runtime spec from the given File, and // normalizes all relative paths into absolute by prepending the bundle dir. func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) { - if _, err := specFile.Seek(0, os.SEEK_SET); err != nil { + if _, err := specFile.Seek(0, io.SeekStart); err != nil { return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) } specBytes, err := ioutil.ReadAll(specFile) @@ -344,15 +345,9 @@ func IsSupportedDevMount(m specs.Mount) bool { var existingDevices = []string{ "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr", "/dev/null", "/dev/zero", "/dev/full", "/dev/random", - "/dev/urandom", "/dev/shm", "/dev/pts", "/dev/ptmx", + "/dev/urandom", "/dev/shm", "/dev/ptmx", } dst := filepath.Clean(m.Destination) - if dst == "/dev" { - // OCI spec uses many different mounts for the things inside of '/dev'. 
We
-		// have a single mount at '/dev' that is always mounted, regardless of
-		// whether it was asked for, as the spec says we SHOULD.
-		return false
-	}
 	for _, dev := range existingDevices {
 		if dst == dev || strings.HasPrefix(dst, dev+"/") {
 			return false
@@ -425,7 +420,7 @@ func Mount(src, dst, typ string, flags uint32) error {
 		// Special case, as there is no source directory for proc mounts.
 		isDir = true
 	} else if fi, err := os.Stat(src); err != nil {
-		return fmt.Errorf("Stat(%q) failed: %v", src, err)
+		return fmt.Errorf("stat(%q) failed: %v", src, err)
 	} else {
 		isDir = fi.IsDir()
 	}
@@ -433,25 +428,25 @@
 	if isDir {
 		// Create the destination directory.
 		if err := os.MkdirAll(dst, 0777); err != nil {
-			return fmt.Errorf("Mkdir(%q) failed: %v", dst, err)
+			return fmt.Errorf("mkdir(%q) failed: %v", dst, err)
 		}
 	} else {
 		// Create the parent destination directory.
 		parent := path.Dir(dst)
 		if err := os.MkdirAll(parent, 0777); err != nil {
-			return fmt.Errorf("Mkdir(%q) failed: %v", parent, err)
+			return fmt.Errorf("mkdir(%q) failed: %v", parent, err)
 		}
 		// Create the destination file if it does not exist.
 		f, err := os.OpenFile(dst, syscall.O_CREAT, 0777)
 		if err != nil {
-			return fmt.Errorf("Open(%q) failed: %v", dst, err)
+			return fmt.Errorf("open(%q) failed: %v", dst, err)
 		}
 		f.Close()
 	}

 	// Do the mount.
 	if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil {
-		return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err)
+		return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err)
 	}
 	return nil
 }
@@ -486,35 +481,6 @@ func GetOOMScoreAdj(pid int) (int, error) {
 	return strconv.Atoi(strings.TrimSpace(string(data)))
 }

-// GetParentPid gets the parent process ID of the specified PID.
-func GetParentPid(pid int) (int, error) {
-	data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/stat", pid))
-	if err != nil {
-		return 0, err
-	}
-
-	var cpid string
-	var name string
-	var state string
-	var ppid int
-	// Parse after the binary name.
-	_, err = fmt.Sscanf(string(data),
-		"%v %v %v %d",
-		// cpid is ignored.
-		&cpid,
-		// name is ignored.
-		&name,
-		// state is ignored.
-		&state,
-		&ppid)
-
-	if err != nil {
-		return 0, err
-	}
-
-	return ppid, nil
-}
-
 // EnvVar looks for a variable value in the env slice assuming the following
 // format: "NAME=VALUE".
 func EnvVar(env []string, name string) (string, bool) {
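The diff ends at EnvVar's signature; its body is unchanged and omitted here. For reference, a lookup with the documented "NAME=VALUE" semantics could be sketched as follows (illustrative, not the actual specutils code):

package sketch

import "strings"

// envVar scans env entries of the form "NAME=VALUE" and returns the
// value and whether the variable was found.
func envVar(env []string, name string) (string, bool) {
	prefix := name + "="
	for _, e := range env {
		if strings.HasPrefix(e, prefix) {
			return e[len(prefix):], true
		}
	}
	return "", false
}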