Diffstat (limited to 'runsc')
44 files changed, 1623 insertions(+), 671 deletions(-)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 8c73dc5dc..67307ab3c 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -33,6 +33,7 @@ go_library( "//pkg/cpuid", "//pkg/eventchannel", "//pkg/fd", + "//pkg/flipcall", "//pkg/fspath", "//pkg/log", "//pkg/memutil", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 7076ae2e2..a3a76b609 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -53,7 +53,7 @@ type compatEmitter struct { func newCompatEmitter(logFD int) (*compatEmitter, error) { nameMap, ok := getSyscallNameMap() if !ok { - return nil, fmt.Errorf("Linux syscall table not found") + return nil, fmt.Errorf("syscall table not found") } c := &compatEmitter{ diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fdf13c8e1..cb5d8ea31 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -104,13 +104,11 @@ const ( // Profiling related commands (see pprof.go for more details). const ( - StartCPUProfile = "Profile.StartCPUProfile" - StopCPUProfile = "Profile.StopCPUProfile" - HeapProfile = "Profile.HeapProfile" - BlockProfile = "Profile.BlockProfile" - MutexProfile = "Profile.MutexProfile" - StartTrace = "Profile.StartTrace" - StopTrace = "Profile.StopTrace" + CPUProfile = "Profile.CPU" + HeapProfile = "Profile.Heap" + BlockProfile = "Profile.Block" + MutexProfile = "Profile.Mutex" + Trace = "Profile.Trace" ) // Logging related commands (see logging.go for more details). @@ -131,9 +129,6 @@ type controller struct { // manager holds the containerManager methods. manager *containerManager - - // pprop holds the profile instance if enabled. It may be nil. - pprof *control.Profile } // newController creates a new controller. The caller must call @@ -164,19 +159,14 @@ func newController(fd int, l *Loader) (*controller, error) { ctrl.srv.Register(&control.Logging{}) if l.root.conf.ProfileEnable { - ctrl.pprof = &control.Profile{Kernel: l.k} - ctrl.srv.Register(ctrl.pprof) + ctrl.srv.Register(control.NewProfile(l.k)) } return ctrl, nil } func (c *controller) stop() { - if c.pprof != nil { - // These are noop if there is nothing being profiled. - _ = c.pprof.StopCPUProfile(nil, nil) - _ = c.pprof.StopTrace(nil, nil) - } + c.srv.Stop() } // containerManager manages sandbox containers. @@ -211,10 +201,31 @@ func (cm *containerManager) Processes(cid *string, out *[]*control.Process) erro return control.Processes(cm.l.k, *cid, out) } +// CreateArgs contains arguments to the Create method. +type CreateArgs struct { + // CID is the ID of the container to start. + CID string + + // FilePayload may contain a TTY file for the terminal, if enabled. + urpc.FilePayload +} + // Create creates a container within a sandbox. -func (cm *containerManager) Create(cid *string, _ *struct{}) error { - log.Debugf("containerManager.Create, cid: %s", *cid) - return cm.l.createContainer(*cid) +func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error { + log.Debugf("containerManager.Create: %s", args.CID) + + if len(args.Files) > 1 { + return fmt.Errorf("start arguments must have at most 1 files for TTY") + } + var tty *fd.FD + if len(args.Files) == 1 { + var err error + tty, err = fd.NewFromFile(args.Files[0]) + if err != nil { + return fmt.Errorf("error dup'ing TTY file: %w", err) + } + } + return cm.l.createContainer(args.CID, tty) } // StartArgs contains arguments to the Start method. @@ -229,9 +240,8 @@ type StartArgs struct { CID string // FilePayload contains, in order: - // * stdin, stdout, and stderr. 
- // * the file descriptor over which the sandbox will - // request files from its root filesystem. + // * stdin, stdout, and stderr (optional: if terminal is disabled). + // * file descriptors to connect to gofer to serve the root filesystem. urpc.FilePayload } @@ -251,23 +261,45 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } - if len(args.FilePayload.Files) < 4 { - return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") + if len(args.Files) < 1 { + return fmt.Errorf("start arguments must contain at least one file for the container root gofer") } // All validation passed, logs the spec for debugging. specutils.LogSpec(args.Spec) - fds, err := fd.NewFromFiles(args.FilePayload.Files) + goferFiles := args.Files + var stdios []*fd.FD + if !args.Spec.Process.Terminal { + // When not using a terminal, stdios come as the first 3 files in the + // payload. + if l := len(args.Files); l < 4 { + return fmt.Errorf("start arguments (len: %d) must contain stdios and files for the container root gofer", l) + } + var err error + stdios, err = fd.NewFromFiles(goferFiles[:3]) + if err != nil { + return fmt.Errorf("error dup'ing stdio files: %w", err) + } + goferFiles = goferFiles[3:] + } + defer func() { + for _, fd := range stdios { + _ = fd.Close() + } + }() + + goferFDs, err := fd.NewFromFiles(goferFiles) if err != nil { - return err + return fmt.Errorf("error dup'ing gofer files: %w", err) } defer func() { - for _, fd := range fds { + for _, fd := range goferFDs { _ = fd.Close() } }() - if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil { + + if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil { log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err) return err } @@ -330,18 +362,18 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") var specFile, deviceFile *os.File - switch numFiles := len(o.FilePayload.Files); numFiles { + switch numFiles := len(o.Files); numFiles { case 2: // The device file is donated to the platform. // Can't take ownership away from os.File. dup them to get a new FD. 
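The new Start contract is easiest to see from the caller's side: stdio files are sent only when the spec has no terminal, always followed by the gofer files. A minimal sketch of assembling the payload — the field names match the StartArgs above, while the surrounding client wiring is assumed:

import (
	"os"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.dev/gvisor/pkg/urpc"
	"gvisor.dev/gvisor/runsc/boot"
	"gvisor.dev/gvisor/runsc/config"
)

// startArgs assembles the Start payload under the new contract.
func startArgs(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) boot.StartArgs {
	var files []*os.File
	if !spec.Process.Terminal {
		// Without a terminal, stdin, stdout, and stderr must come first.
		files = append(files, os.Stdin, os.Stdout, os.Stderr)
	}
	return boot.StartArgs{
		Spec:        spec,
		Conf:        conf,
		CID:         cid,
		FilePayload: urpc.FilePayload{Files: append(files, goferFiles...)},
	}
}
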
- fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd())) + fd, err := syscall.Dup(int(o.Files[1].Fd())) if err != nil { return fmt.Errorf("failed to dup file: %v", err) } deviceFile = os.NewFile(uintptr(fd), "platform device") fallthrough case 1: - specFile = o.FilePayload.Files[0] + specFile = o.Files[0] case 0: return fmt.Errorf("at least one file must be passed to Restore") default: diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index a7c4ebb0c..eacd73531 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -343,6 +343,21 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_PKTINFO), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVERR), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IPV6), seccomp.EqualTo(syscall.IPV6_TCLASS), }, @@ -354,10 +369,20 @@ func hostInetFilters() seccomp.SyscallRules { { seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVERR), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), seccomp.EqualTo(syscall.IPV6_V6ONLY), }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_SOCKET), seccomp.EqualTo(syscall.SO_ERROR), }, @@ -393,6 +418,11 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TIMESTAMP), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_TCP), seccomp.EqualTo(syscall.TCP_NODELAY), }, @@ -401,6 +431,11 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.EqualTo(syscall.SOL_TCP), seccomp.EqualTo(syscall.TCP_INFO), }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(linux.TCP_INQ), + }, }, syscall.SYS_IOCTL: []seccomp.Rule{ { @@ -421,29 +456,29 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_SETSOCKOPT: []seccomp.Rule{ { seccomp.MatchAny{}, - seccomp.EqualTo(syscall.SOL_IPV6), - seccomp.EqualTo(syscall.IPV6_V6ONLY), + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), seccomp.MatchAny{}, seccomp.EqualTo(4), }, { seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_SOCKET), - seccomp.EqualTo(syscall.SO_SNDBUF), + seccomp.EqualTo(syscall.SO_RCVBUF), seccomp.MatchAny{}, seccomp.EqualTo(4), }, { seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_SOCKET), - seccomp.EqualTo(syscall.SO_RCVBUF), + seccomp.EqualTo(syscall.SO_REUSEADDR), seccomp.MatchAny{}, seccomp.EqualTo(4), }, { seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_SOCKET), - seccomp.EqualTo(syscall.SO_REUSEADDR), + seccomp.EqualTo(syscall.SO_TIMESTAMP), seccomp.MatchAny{}, seccomp.EqualTo(4), }, @@ -456,6 +491,13 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(linux.TCP_INQ), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IP), seccomp.EqualTo(syscall.IP_TOS), seccomp.MatchAny{}, @@ -470,6 +512,27 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_PKTINFO), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, + 
seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVERR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IPV6), seccomp.EqualTo(syscall.IPV6_TCLASS), seccomp.MatchAny{}, @@ -482,6 +545,27 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.MatchAny{}, seccomp.EqualTo(4), }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVERR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_V6ONLY), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 6b6ae98d7..2b0d2cd51 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -22,15 +22,6 @@ import ( "strings" "syscall" - // Include filesystem types that OCI spec might mount. - _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" - _ "gvisor.dev/gvisor/pkg/sentry/fs/host" - _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" - _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" - _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" - "gvisor.dev/gvisor/pkg/sentry/vfs" - specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -48,9 +39,18 @@ import ( tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" + + // Include filesystem types that OCI spec might mount. + _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" + _ "gvisor.dev/gvisor/pkg/sentry/fs/host" + _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" + _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" ) const ( diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index ebdd518d0..d37528ee7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -75,12 +75,14 @@ import ( "gvisor.dev/gvisor/runsc/specutils" "gvisor.dev/gvisor/runsc/specutils/seccomp" - // Include supported socket providers. + // Top-level inet providers. "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + + // Include other supported socket providers. _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent" - "gvisor.dev/gvisor/pkg/sentry/socket/netstack" _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) @@ -157,6 +159,11 @@ type execProcess struct { // pidnsPath is the pid namespace path in spec pidnsPath string + + // hostTTY is present when creating a sub-container with terminal enabled. + // TTY file is passed during container create and must be saved until + // container start. + hostTTY *fd.FD } func init() { @@ -289,7 +296,7 @@ func New(args Args) (*Loader, error) { if args.TotalMem > 0 { // Adjust the total memory returned by the Sentry so that applications that // use /proc/meminfo can make allocations based on this limit. 
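Each seccomp.Rule in the lists above matches syscall arguments by position. For reference, one of the new setsockopt rules in isolation, with the argument positions spelled out — a sketch built only from constructs already used in this file:

import (
	"syscall"

	"gvisor.dev/gvisor/pkg/seccomp"
)

// pktInfoRule allows setsockopt(fd, SOL_IP, IP_PKTINFO, optval, 4) and
// nothing else: any FD and any optval pointer are accepted, but level,
// optname, and optlen must match exactly.
var pktInfoRule = seccomp.SyscallRules{
	syscall.SYS_SETSOCKOPT: []seccomp.Rule{
		{
			seccomp.MatchAny{},                  // arg0: fd
			seccomp.EqualTo(syscall.SOL_IP),     // arg1: level
			seccomp.EqualTo(syscall.IP_PKTINFO), // arg2: optname
			seccomp.MatchAny{},                  // arg3: optval
			seccomp.EqualTo(4),                  // arg4: optlen
		},
	},
}
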
- usage.MinimumTotalMemoryBytes = args.TotalMem + usage.MaximumTotalMemoryBytes = args.TotalMem log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30)) } @@ -435,6 +442,10 @@ func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k * if err != nil { return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err) } + env, err := specutils.ResolveEnvs(spec.Process.Env) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err) + } wd := spec.Process.Cwd if wd == "" { @@ -444,7 +455,7 @@ func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k * // Create the process arguments. procArgs := kernel.CreateProcessArgs{ Argv: spec.Process.Args, - Envv: spec.Process.Env, + Envv: env, WorkingDirectory: wd, Credentials: creds, Umask: 0022, @@ -588,10 +599,11 @@ func (l *Loader) run() error { // Create the root container init task. It will begin running // when the kernel is started. - if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { + var err error + _, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(true, l.sandboxID, &l.root) + if err != nil { return err } - } ep.tg = l.k.GlobalInit() @@ -627,7 +639,7 @@ func (l *Loader) run() error { } // createContainer creates a new container inside the sandbox. -func (l *Loader) createContainer(cid string) error { +func (l *Loader) createContainer(cid string, tty *fd.FD) error { l.mu.Lock() defer l.mu.Unlock() @@ -635,14 +647,14 @@ func (l *Loader) createContainer(cid string) error { if _, ok := l.processes[eid]; ok { return fmt.Errorf("container %q already exists", cid) } - l.processes[eid] = &execProcess{} + l.processes[eid] = &execProcess{hostTTY: tty} return nil } // startContainer starts a child container. It returns the thread group ID of // the newly created process. Used FDs are either closed or released. It's safe // for the caller to close any remaining files upon return. -func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error { +func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -695,36 +707,41 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin info := &containerInfo{ conf: conf, spec: spec, - stdioFDs: files[:3], - goferFDs: files[3:], + goferFDs: goferFDs, } info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } - tg, err := l.createContainerProcess(false, cid, info, ep) + + // Use stdios or TTY depending on the spec configuration. + if spec.Process.Terminal { + if len(stdioFDs) > 0 { + return fmt.Errorf("using TTY, stdios not expected: %v", stdioFDs) + } + if ep.hostTTY == nil { + return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?") + } + info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY} + ep.hostTTY = nil + } else { + info.stdioFDs = stdioFDs + } + + ep.tg, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(false, cid, info) if err != nil { return err } - - // Success! 
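createContainer only records the donated TTY; the file itself arrives earlier through the Create RPC. A caller-side sketch of that donation — the urpc.Client type, its Call signature, and the boot.ContainerCreate method name are assumptions here, not taken from this diff:

import (
	"fmt"
	"os"

	"gvisor.dev/gvisor/pkg/urpc"
	"gvisor.dev/gvisor/runsc/boot"
	"gvisor.dev/gvisor/runsc/console"
)

// createWithTTY allocates a pty pair, sends the master over the console
// socket, and donates the replica to the sandbox; the loader saves it as
// hostTTY until the container is started.
func createWithTTY(conn *urpc.Client, cid, socketPath string) error {
	tty, err := console.NewWithSocket(socketPath)
	if err != nil {
		return fmt.Errorf("setting up console with socket %q: %w", socketPath, err)
	}
	defer tty.Close() // The RPC layer dups the FD when donating it.

	args := boot.CreateArgs{
		CID:         cid,
		FilePayload: urpc.FilePayload{Files: []*os.File{tty}},
	}
	return conn.Call(boot.ContainerCreate, &args, nil) // Assumed method constant.
}
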
- l.k.StartProcess(tg) - ep.tg = tg + l.k.StartProcess(ep.tg) return nil } -func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) { - console := false - if root { - // Only root container supports terminal for now. - console = info.spec.Process.Terminal - } - +func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Create the FD map, which will set stdin, stdout, and stderr. ctx := info.procArgs.NewContext(l.k) - fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs) + fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs) if err != nil { - return nil, fmt.Errorf("importing fds: %v", err) + return nil, nil, nil, fmt.Errorf("importing fds: %v", err) } // CreateProcess takes a reference on fdTable if successful. We won't need // ours either way. @@ -736,11 +753,11 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints) if root { if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { - return nil, err + return nil, nil, nil, err } } if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil { - return nil, err + return nil, nil, nil, err } // Add the HOME environment variable if it is not already set. @@ -754,29 +771,25 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } if err != nil { - return nil, err + return nil, nil, nil, err } info.procArgs.Envv = envv // Create and start the new process. tg, _, err := l.k.CreateProcess(info.procArgs) if err != nil { - return nil, fmt.Errorf("creating process: %v", err) + return nil, nil, nil, fmt.Errorf("creating process: %v", err) } // CreateProcess takes a reference on FDTable if successful. info.procArgs.FDTable.DecRef(ctx) // Set the foreground process group on the TTY to the global init process // group, since that is what we are about to start running. - if root { - switch { - case ttyFileVFS2 != nil: - ep.ttyVFS2 = ttyFileVFS2 - ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) - case ttyFile != nil: - ep.tty = ttyFile - ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) - } + switch { + case ttyFileVFS2 != nil: + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFile != nil: + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) } // Install seccomp filters with the new task if there are any. @@ -784,7 +797,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) if err != nil { - return nil, fmt.Errorf("building seccomp program: %v", err) + return nil, nil, nil, fmt.Errorf("building seccomp program: %v", err) } if log.IsLogging(log.Debug) { @@ -795,7 +808,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn task := tg.Leader() // NOTE: It seems Flags are ignored by runc so we ignore them too. 
if err := task.AppendSyscallFilter(program, true); err != nil { - return nil, fmt.Errorf("appending seccomp filters: %v", err) + return nil, nil, nil, fmt.Errorf("appending seccomp filters: %v", err) } } } else { @@ -804,7 +817,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn } } - return tg, nil + return tg, ttyFile, ttyFileVFS2, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on @@ -926,6 +939,11 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { } } + args.Envv, err = specutils.ResolveEnvs(args.Envv) + if err != nil { + return 0, fmt.Errorf("resolving env: %w", err) + } + // Add the HOME environment variable if it is not already set. if kernel.VFS2Enabled { root := args.MountNamespaceVFS2.Root() @@ -1037,9 +1055,10 @@ func (l *Loader) WaitExit() kernel.ExitStatus { // Wait for container. l.k.WaitExited() - // Cleanup + // Stop the control server. l.ctrl.stop() + // Check all references. refs.OnExit() return l.k.GlobalInit().ExitStatus() @@ -1074,7 +1093,12 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} - transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4} + transProtos := []stack.TransportProtocolFactory{ + tcp.NewProtocol, + udp.NewProtocol, + icmp.NewProtocol4, + icmp.NewProtocol6, + } s := netstack.Stack{stack.New(stack.Options{ NetworkProtocols: netProtos, TransportProtocols: transProtos, diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 988573640..3d3a813df 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" - "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -41,9 +40,9 @@ var ( // "::1/8" on "lo" interface. DefaultLoopbackLink = LoopbackLink{ Name: "lo", - Addresses: []net.IP{ - net.IP("\x7f\x00\x00\x01"), - net.IPv6loopback, + Addresses: []IPWithPrefix{ + {Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8}, + {Address: net.IPv6loopback, PrefixLen: 128}, }, Routes: []Route{ { @@ -83,7 +82,7 @@ type DefaultRoute struct { type FDBasedLink struct { Name string MTU int - Addresses []net.IP + Addresses []IPWithPrefix Routes []Route GSOMaxSize uint32 SoftwareGSOEnabled bool @@ -100,7 +99,7 @@ type FDBasedLink struct { // LoopbackLink configures a loopback li nk. type LoopbackLink struct { Name string - Addresses []net.IP + Addresses []IPWithPrefix Routes []Route } @@ -118,6 +117,19 @@ type CreateLinksAndRoutesArgs struct { Defaultv6Gateway DefaultRoute } +// IPWithPrefix is an address with its subnet prefix length. +type IPWithPrefix struct { + // Address is a network address. + Address net.IP + + // PrefixLen is the subnet prefix length. + PrefixLen int +} + +func (ip IPWithPrefix) String() string { + return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen) +} + // Empty returns true if route hasn't been set. 
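Link addresses are now carried together with their subnet prefix length instead of as bare IPs, as the new loopback defaults above show. A hypothetical helper illustrating the conversion from CIDR notation:

import (
	"net"

	"gvisor.dev/gvisor/runsc/boot"
)

// ipWithPrefixFromCIDR turns "192.168.1.10/24" into the address-plus-
// prefix form that FDBasedLink and LoopbackLink now expect.
func ipWithPrefixFromCIDR(s string) (boot.IPWithPrefix, error) {
	ip, ipnet, err := net.ParseCIDR(s)
	if err != nil {
		return boot.IPWithPrefix{}, err
	}
	ones, _ := ipnet.Mask.Size()
	return boot.IPWithPrefix{Address: ip, PrefixLen: ones}, nil
}
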
func (r *Route) Empty() bool { return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil @@ -265,20 +277,19 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct // createNICWithAddrs creates a NIC in the network stack and adds the given // addresses. -func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error { +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []IPWithPrefix) error { opts := stack.NICOptions{Name: name} if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil { return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err) } - // Always start with an arp address for the NIC. - if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { - return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err) - } - for _, addr := range addrs { - proto, tcpipAddr := ipToAddressAndProto(addr) - if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil { + proto, tcpipAddr := ipToAddressAndProto(addr.Address) + ap := tcpip.AddressWithPrefix{ + Address: tcpipAddr, + PrefixLen: addr.PrefixLen, + } + if err := n.Stack.AddAddressWithPrefix(id, proto, ap); err != nil { return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err) } } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index b157387ef..3fd28e516 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -250,36 +250,76 @@ func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Cre overlayOpts := *lowerOpts overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} - // Next mount upper and lower. Upper is a tmpfs mount to keep all - // modifications inside the sandbox. - upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) - if err != nil { - return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) - } - cu := cleanup.Make(func() { upper.DecRef(ctx) }) - defer cu.Clean() - // All writes go to the upper layer, be paranoid and make lower readonly. lowerOpts.ReadOnly = true lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) if err != nil { return nil, nil, err } - cu.Add(func() { lower.DecRef(ctx) }) + cu := cleanup.Make(func() { lower.DecRef(ctx) }) + defer cu.Clean() - // Propagate the lower layer's root's owner, group, and mode to the upper - // layer's root for consistency with VFS1. - upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) + // Determine the lower layer's root's type. 
lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root()) stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{ Root: lowerRootVD, Start: lowerRootVD, }, &vfs.StatOptions{ - Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE, + Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE, }) if err != nil { - return nil, nil, err + return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err) + } + if stat.Mask&linux.STATX_TYPE == 0 { + return nil, nil, fmt.Errorf("failed to get file type of lower layer's root") + } + rootType := stat.Mode & linux.S_IFMT + if rootType != linux.S_IFDIR && rootType != linux.S_IFREG { + return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType) + } + + // Upper is a tmpfs mount to keep all modifications inside the sandbox. + upperOpts.GetFilesystemOptions.InternalData = tmpfs.FilesystemOpts{ + RootFileType: uint16(rootType), + } + upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) + if err != nil { + return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) + } + cu.Add(func() { upper.DecRef(ctx) }) + + // If the overlay mount consists of a regular file, copy up its contents + // from the lower layer, since in the overlay the otherwise-empty upper + // layer file will take precedence. + upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) + if rootType == linux.S_IFREG { + lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ + Root: lowerRootVD, + Start: lowerRootVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err) + } + defer lowerFD.DecRef(ctx) + upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ + Root: upperRootVD, + Start: upperRootVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY, + }) + if err != nil { + return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err) + } + defer upperFD.DecRef(ctx) + if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil { + return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err) + } } + + // Propagate the lower layer's root's owner, group, and mode to the upper + // layer's root for consistency with VFS1. err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{ Root: upperRootVD, Start: upperRootVD, diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 5bd0afc52..13c6a16a0 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -41,22 +41,22 @@ const ( ) var controllers = map[string]config{ - "blkio": config{ctrlr: &blockIO{}}, - "cpu": config{ctrlr: &cpu{}}, - "cpuset": config{ctrlr: &cpuSet{}}, - "hugetlb": config{ctrlr: &hugeTLB{}, optional: true}, - "memory": config{ctrlr: &memory{}}, - "net_cls": config{ctrlr: &networkClass{}}, - "net_prio": config{ctrlr: &networkPrio{}}, - "pids": config{ctrlr: &pids{}}, + "blkio": {ctrlr: &blockIO{}}, + "cpu": {ctrlr: &cpu{}}, + "cpuset": {ctrlr: &cpuSet{}}, + "hugetlb": {ctrlr: &hugeTLB{}, optional: true}, + "memory": {ctrlr: &memory{}}, + "net_cls": {ctrlr: &networkClass{}}, + "net_prio": {ctrlr: &networkPrio{}}, + "pids": {ctrlr: &pids{}}, // These controllers either don't have anything in the OCI spec or is // irrelevant for a sandbox. 
- "devices": config{ctrlr: &noop{}}, - "freezer": config{ctrlr: &noop{}}, - "perf_event": config{ctrlr: &noop{}}, - "rdma": config{ctrlr: &noop{}, optional: true}, - "systemd": config{ctrlr: &noop{}}, + "devices": {ctrlr: &noop{}}, + "freezer": {ctrlr: &noop{}}, + "perf_event": {ctrlr: &noop{}}, + "rdma": {ctrlr: &noop{}, optional: true}, + "systemd": {ctrlr: &noop{}}, } func setOptionalValueInt(path, name string, val *int64) error { @@ -234,7 +234,7 @@ func loadPathsHelper(cgroup io.Reader) (map[string]string, error) { type Cgroup struct { Name string `json:"name"` Parents map[string]string `json:"parents"` - Own bool `json:"own"` + Own map[string]bool `json:"own"` } // New creates a new Cgroup instance if the spec includes a cgroup path. @@ -251,9 +251,11 @@ func New(spec *specs.Spec) (*Cgroup, error) { return nil, fmt.Errorf("finding current cgroups: %w", err) } } + own := make(map[string]bool) return &Cgroup{ Name: spec.Linux.CgroupsPath, Parents: parents, + Own: own, }, nil } @@ -261,18 +263,8 @@ func New(spec *specs.Spec) (*Cgroup, error) { // already exists, it means that the caller has already provided a // pre-configured cgroups, and 'res' is ignored. func (c *Cgroup) Install(res *specs.LinuxResources) error { - if _, err := os.Stat(c.makePath("memory")); err == nil { - // If cgroup has already been created; it has been setup by caller. Don't - // make any changes to configuration, just join when sandbox/gofer starts. - log.Debugf("Using pre-created cgroup %q", c.Name) - return nil - } - log.Debugf("Creating cgroup %q", c.Name) - // Mark that cgroup resources are owned by me. - c.Own = true - // The Cleanup object cleans up partially created cgroups when an error occurs. // Errors occuring during cleanup itself are ignored. clean := cleanup.Make(func() { _ = c.Uninstall() }) @@ -280,6 +272,16 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { for key, cfg := range controllers { path := c.makePath(key) + if _, err := os.Stat(path); err == nil { + // If cgroup has already been created; it has been setup by caller. Don't + // make any changes to configuration, just join when sandbox/gofer starts. + log.Debugf("Using pre-created cgroup %q", path) + continue + } + + // Mark that cgroup resources are owned by me. + c.Own[key] = true + if err := os.MkdirAll(path, 0755); err != nil { if cfg.optional && errors.Is(err, syscall.EROFS) { log.Infof("Skipping cgroup %q", key) @@ -298,12 +300,12 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { // Uninstall removes the settings done in Install(). If cgroup path already // existed when Install() was called, Uninstall is a noop. func (c *Cgroup) Uninstall() error { - if !c.Own { - // cgroup is managed by caller, don't touch it. - return nil - } log.Debugf("Deleting cgroup %q", c.Name) for key := range controllers { + if !c.Own[key] { + // cgroup is managed by caller, don't touch it. + continue + } path := c.makePath(key) log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) @@ -369,6 +371,7 @@ func (c *Cgroup) Join() (func(), error) { return undo, nil } +// CPUQuota returns the CFS CPU quota. 
func (c *Cgroup) CPUQuota() (float64, error) { path := c.makePath("cpu") quota, err := getInt(path, "cpu.cfs_quota_us") diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 9794517a7..931144cf9 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -29,7 +29,10 @@ func TestUninstallEnoent(t *testing.T) { c := Cgroup{ // set a non-existent name Name: "runsc-test-uninstall-656e6f656e740a", - Own: true, + } + c.Own = make(map[string]bool) + for key := range controllers { + c.Own[key] = true } if err := c.Uninstall(); err != nil { t.Errorf("Uninstall() failed: %v", err) diff --git a/runsc/cli/main.go b/runsc/cli/main.go index bca015db5..6c3bf4d21 100644 --- a/runsc/cli/main.go +++ b/runsc/cli/main.go @@ -22,6 +22,7 @@ import ( "io/ioutil" "os" "os/signal" + "runtime" "syscall" "time" @@ -82,6 +83,7 @@ func Main(version string) { subcommands.Register(new(cmd.Spec), "") subcommands.Register(new(cmd.State), "") subcommands.Register(new(cmd.Start), "") + subcommands.Register(new(cmd.Symbolize), "") subcommands.Register(new(cmd.Wait), "") // Register internal commands with the internal group name. This causes @@ -207,6 +209,8 @@ func Main(version string) { log.Infof("***************************") log.Infof("Args: %s", os.Args) log.Infof("Version %s", version) + log.Infof("GOOS: %s", runtime.GOOS) + log.Infof("GOARCH: %s", runtime.GOARCH) log.Infof("PID: %d", os.Getpid()) log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) log.Infof("Configuration:") diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 2556f6d9e..19520d7ab 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -32,6 +32,7 @@ go_library( "start.go", "state.go", "statefile.go", + "symbolize.go", "syscalls.go", "wait.go", ], @@ -39,6 +40,7 @@ go_library( "//runsc:__subpackages__", ], deps = [ + "//pkg/coverage", "//pkg/log", "//pkg/p9", "//pkg/sentry/control", diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index c0bc8f064..124198239 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -75,7 +75,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa conf := args[0].(*config.Config) waitStatus := args[1].(*syscall.WaitStatus) - cont, err := container.LoadAndCheck(conf.RootDir, id) + cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 609e8231c..b84142b0d 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -17,8 +17,10 @@ package cmd import ( "context" "os" + "os/signal" "strconv" "strings" + "sync" "syscall" "time" @@ -43,6 +45,7 @@ type Debug struct { strace string logLevel string logPackets string + delay time.Duration duration time.Duration ps bool } @@ -70,10 +73,11 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.") f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.") - f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles") + f.DurationVar(&d.delay, "delay", time.Hour, "amount of time to delay for collecting heap and goroutine profiles.") + f.DurationVar(&d.duration, "duration", time.Hour, "amount of time to wait for CPU and trace profiles.") f.StringVar(&d.trace, "trace", "", "writes an 
execution trace to the given file.") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") - f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`) + f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all.`) f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).") f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.") f.BoolVar(&d.ps, "ps", false, "lists processes") @@ -90,8 +94,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) f.Usage() return subcommands.ExitUsageError } + id := f.Arg(0) + var err error - c, err = container.LoadAndCheck(conf.RootDir, f.Arg(0)) + c, err = container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { return Errorf("loading container %q: %v", f.Arg(0), err) } @@ -106,9 +112,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return Errorf("listing containers: %v", err) } for _, id := range ids { - candidate, err := container.LoadAndCheck(conf.RootDir, id) + candidate, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true, SkipCheck: true}) if err != nil { - return Errorf("loading container %q: %v", id, err) + log.Warningf("Skipping container %q: %v", id, err) + continue } if candidate.SandboxPid() == d.pid { c = candidate @@ -120,11 +127,12 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } - if c.Sandbox == nil || !c.Sandbox.IsRunning() { + if !c.IsSandboxRunning() { return Errorf("container sandbox is not running") } log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid) + // Perform synchronous actions. 
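LoadAndCheck is gone; commands now call container.Load with an explicit FullID and LoadOpts. A sketch of the two patterns used above — the default form for a single user-supplied ID (which, judging by the Exact option, presumably also resolves a unique partial ID), and the strict, non-fatal form for scanning all containers:

import (
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/runsc/container"
)

// loadOne resolves a single user-supplied ID, as most commands do.
func loadOne(conf *config.Config, id string) (*container.Container, error) {
	return container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
}

// loadAll loads every container, skipping entries that fail to load
// instead of aborting the whole scan.
func loadAll(conf *config.Config, ids []container.FullID) []*container.Container {
	var cs []*container.Container
	for _, id := range ids {
		c, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true, SkipCheck: true})
		if err != nil {
			continue // Best effort: leave broken entries out.
		}
		cs = append(cs, c)
	}
	return cs
}
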
if d.signal > 0 { log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid) if err := syscall.Kill(c.Sandbox.Pid, syscall.Signal(d.signal)); err != nil { @@ -139,81 +147,6 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof(" *** Stack dump ***\n%s", stacks) } - if d.profileHeap != "" { - f, err := os.Create(d.profileHeap) - if err != nil { - return Errorf(err.Error()) - } - defer f.Close() - - if err := c.Sandbox.HeapProfile(f); err != nil { - return Errorf(err.Error()) - } - log.Infof("Heap profile written to %q", d.profileHeap) - } - if d.profileBlock != "" { - f, err := os.Create(d.profileBlock) - if err != nil { - return Errorf(err.Error()) - } - defer f.Close() - - if err := c.Sandbox.BlockProfile(f); err != nil { - return Errorf(err.Error()) - } - log.Infof("Block profile written to %q", d.profileBlock) - } - if d.profileMutex != "" { - f, err := os.Create(d.profileMutex) - if err != nil { - return Errorf(err.Error()) - } - defer f.Close() - - if err := c.Sandbox.MutexProfile(f); err != nil { - return Errorf(err.Error()) - } - log.Infof("Mutex profile written to %q", d.profileMutex) - } - - delay := false - if d.profileCPU != "" { - delay = true - f, err := os.Create(d.profileCPU) - if err != nil { - return Errorf(err.Error()) - } - defer func() { - f.Close() - if err := c.Sandbox.StopCPUProfile(); err != nil { - Fatalf(err.Error()) - } - log.Infof("CPU profile written to %q", d.profileCPU) - }() - if err := c.Sandbox.StartCPUProfile(f); err != nil { - return Errorf(err.Error()) - } - log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU) - } - if d.trace != "" { - delay = true - f, err := os.Create(d.trace) - if err != nil { - return Errorf(err.Error()) - } - defer func() { - f.Close() - if err := c.Sandbox.StopTrace(); err != nil { - Fatalf(err.Error()) - } - log.Infof("Trace written to %q", d.trace) - }() - if err := c.Sandbox.StartTrace(f); err != nil { - return Errorf(err.Error()) - } - log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace) - } - if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 { args := control.LoggingArgs{} switch strings.ToLower(d.strace) { @@ -282,8 +215,156 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) log.Infof(o) } - if delay { - time.Sleep(d.duration) + // Open profiling files. 
+ var ( + heapFile *os.File + cpuFile *os.File + traceFile *os.File + blockFile *os.File + mutexFile *os.File + ) + if d.profileHeap != "" { + f, err := os.OpenFile(d.profileHeap, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return Errorf("error opening heap profile output: %v", err) + } + defer f.Close() + heapFile = f + } + if d.profileCPU != "" { + f, err := os.OpenFile(d.profileCPU, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return Errorf("error opening cpu profile output: %v", err) + } + defer f.Close() + cpuFile = f + } + if d.trace != "" { + f, err := os.OpenFile(d.trace, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return Errorf("error opening trace profile output: %v", err) + } + traceFile = f + } + if d.profileBlock != "" { + f, err := os.OpenFile(d.profileBlock, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return Errorf("error opening blocking profile output: %v", err) + } + defer f.Close() + blockFile = f + } + if d.profileMutex != "" { + f, err := os.OpenFile(d.profileMutex, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return Errorf("error opening mutex profile output: %v", err) + } + defer f.Close() + mutexFile = f + } + + // Collect profiles. + var ( + wg sync.WaitGroup + heapErr error + cpuErr error + traceErr error + blockErr error + mutexErr error + ) + if heapFile != nil { + wg.Add(1) + go func() { + defer wg.Done() + heapErr = c.Sandbox.HeapProfile(heapFile, d.delay) + }() + } + if cpuFile != nil { + wg.Add(1) + go func() { + defer wg.Done() + cpuErr = c.Sandbox.CPUProfile(cpuFile, d.duration) + }() + } + if traceFile != nil { + wg.Add(1) + go func() { + defer wg.Done() + traceErr = c.Sandbox.Trace(traceFile, d.duration) + }() + } + if blockFile != nil { + wg.Add(1) + go func() { + defer wg.Done() + blockErr = c.Sandbox.BlockProfile(blockFile, d.duration) + }() + } + if mutexFile != nil { + wg.Add(1) + go func() { + defer wg.Done() + mutexErr = c.Sandbox.MutexProfile(mutexFile, d.duration) + }() + } + + // Before sleeping, allow us to catch signals and try to exit + // gracefully before just exiting. If we can't wait for wg, then + // we will not be able to read the errors below safely. + readyChan := make(chan struct{}) + go func() { + defer close(readyChan) + wg.Wait() + }() + signals := make(chan os.Signal, 1) + signal.Notify(signals, syscall.SIGTERM, syscall.SIGINT) + select { + case <-readyChan: + break // Safe to proceed. + case <-signals: + log.Infof("caught signal, waiting at most one more second.") + select { + case <-signals: + log.Infof("caught second signal, exiting immediately.") + os.Exit(1) // Not finished. + case <-time.After(time.Second): + log.Infof("timeout, exiting.") + os.Exit(1) // Not finished. + case <-readyChan: + break // Safe to proceed. + } + } + + // Collect all errors. 
+ errorCount := 0 + if heapErr != nil { + errorCount++ + log.Infof("error collecting heap profile: %v", heapErr) + os.Remove(heapFile.Name()) + } + if cpuErr != nil { + errorCount++ + log.Infof("error collecting cpu profile: %v", cpuErr) + os.Remove(cpuFile.Name()) + } + if traceErr != nil { + errorCount++ + log.Infof("error collecting trace profile: %v", traceErr) + os.Remove(traceFile.Name()) + } + if blockErr != nil { + errorCount++ + log.Infof("error collecting block profile: %v", blockErr) + os.Remove(blockFile.Name()) + } + if mutexErr != nil { + errorCount++ + log.Infof("error collecting mutex profile: %v", mutexErr) + os.Remove(mutexFile.Name()) + } + + if errorCount > 0 { + return subcommands.ExitFailure } return subcommands.ExitSuccess diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index a25637265..a750be131 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -68,7 +68,7 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} func (d *Delete) execute(ids []string, conf *config.Config) error { for _, id := range ids { - c, err := container.LoadAndCheck(conf.RootDir, id) + c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { if os.IsNotExist(err) && d.force { log.Warningf("couldn't find container %q: %v", id, err) diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 640de4c47..8a8d9f752 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -81,7 +81,7 @@ func (c *Do) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { if len(f.Args()) == 0 { - c.Usage() + f.Usage() return subcommands.ExitUsageError } diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index 3836b7b4e..75b0aac8d 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -74,7 +74,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.LoadAndCheck(conf.RootDir, id) + c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading sandbox: %v", err) } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 86c02a22a..e9726401a 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -112,20 +112,20 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } waitStatus := args[1].(*syscall.WaitStatus) - c, err := container.LoadAndCheck(conf.RootDir, id) + c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading sandbox: %v", err) } log.Debugf("Exec arguments: %+v", e) - log.Debugf("Exec capablities: %+v", e.Capabilities) + log.Debugf("Exec capabilities: %+v", e.Capabilities) // Replace empty settings with defaults from container. if e.WorkingDirectory == "" { e.WorkingDirectory = c.Spec.Process.Cwd } if e.Envv == nil { - e.Envv, err = resolveEnvs(c.Spec.Process.Env, ex.env) + e.Envv, err = specutils.ResolveEnvs(c.Spec.Process.Env, ex.env) if err != nil { Fatalf("getting environment variables: %v", err) } @@ -150,7 +150,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus { - // Start the new process and get it pid. + // Start the new process and get its pid. 
pid, err := c.Execute(e) if err != nil { return Errorf("executing processes for container: %v", err) @@ -382,31 +382,6 @@ func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error }, nil } -// resolveEnvs transforms lists of environment variables into a single list of -// environment variables. If a variable is defined multiple times, the last -// value is used. -func resolveEnvs(envs ...[]string) ([]string, error) { - // First create a map of variable names to values. This removes any - // duplicates. - envMap := make(map[string]string) - for _, env := range envs { - for _, str := range env { - parts := strings.SplitN(str, "=", 2) - if len(parts) != 2 { - return nil, fmt.Errorf("invalid variable: %s", str) - } - envMap[parts[0]] = parts[1] - } - } - // Reassemble envMap into a list of environment variables of the form - // NAME=VALUE. - env := make([]string, 0, len(envMap)) - for k, v := range envMap { - env = append(env, fmt.Sprintf("%s=%s", k, v)) - } - return env, nil -} - // capabilities takes a list of capabilities as strings and returns an // auth.TaskCapabilities struct with those capabilities in every capability set. // This mimics runc's behavior. diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index fe69e2a08..aecf0b7ab 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -69,7 +69,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("it is invalid to specify both --all and --pid") } - c, err := container.LoadAndCheck(conf.RootDir, id) + c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index 6907eb16a..9f9a47bd8 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -24,6 +24,7 @@ import ( "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" @@ -71,7 +72,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if l.quiet { for _, id := range ids { - fmt.Println(id) + fmt.Println(id.ContainerID) } return subcommands.ExitSuccess } @@ -79,9 +80,10 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Collect the containers. 
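The resolveEnvs helper removed below lives on as specutils.ResolveEnvs, keeping the documented last-value-wins semantics (see the loader.go and exec.go call sites above). A usage sketch, assuming the moved function kept this variadic signature:

import (
	"fmt"

	"gvisor.dev/gvisor/runsc/specutils"
)

func resolveExample() error {
	// Later lists override earlier ones, so FOO resolves to "b".
	env, err := specutils.ResolveEnvs(
		[]string{"PATH=/usr/bin", "FOO=a"},
		[]string{"FOO=b"},
	)
	if err != nil {
		return fmt.Errorf("resolving env: %w", err)
	}
	fmt.Println(env) // Order is unspecified: the list is rebuilt from a map.
	return nil
}
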
var containers []*container.Container for _, id := range ids { - c, err := container.LoadAndCheck(conf.RootDir, id) + c, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true}) if err != nil { - Fatalf("loading container %q: %v", id, err) + log.Warningf("Skipping container %q: %v", id, err) + continue } containers = append(containers, c) } diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index fe7d4e257..15ef7b577 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -55,7 +55,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s id := f.Arg(0) conf := args[0].(*config.Config) - cont, err := container.LoadAndCheck(conf.RootDir, id) + cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index 18d7a1436..04e3e0bdd 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -60,7 +60,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.LoadAndCheck(conf.RootDir, id) + c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading sandbox: %v", err) } diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index a00928204..856469252 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -56,7 +56,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} id := f.Arg(0) conf := args[0].(*config.Config) - cont, err := container.LoadAndCheck(conf.RootDir, id) + cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index f6499cc44..964a65064 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -55,7 +55,7 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.LoadAndCheck(conf.RootDir, id) + c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index d8a70dd7f..1f7913d5a 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -57,7 +57,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.LoadAndCheck(conf.RootDir, id) + c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/cmd/symbolize.go b/runsc/cmd/symbolize.go new file mode 100644 index 000000000..fc0c69358 --- /dev/null +++ b/runsc/cmd/symbolize.go @@ -0,0 +1,91 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "bufio" + "context" + "os" + "strconv" + "strings" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/runsc/flag" +) + +// Symbolize implements subcommands.Command for the "symbolize" command. +type Symbolize struct { + dumpAll bool +} + +// Name implements subcommands.Command.Name. +func (*Symbolize) Name() string { + return "symbolize" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Symbolize) Synopsis() string { + return "Convert synthetic instruction pointers from kcov into positions in the runsc source code. Only used when Go coverage is enabled." +} + +// Usage implements subcommands.Command.Usage. +func (*Symbolize) Usage() string { + return `symbolize - converts synthetic instruction pointers into positions in the runsc source code. + +This command takes instruction pointers from stdin and converts them into their +corresponding file names and line/column numbers in the runsc source code. The +inputs are not interpreted as actual addresses, but as synthetic values that are +exposed through /sys/kernel/debug/kcov. One can extract coverage information +from kcov and translate those values into locations in the source code by +running symbolize on the same runsc binary. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Symbolize) SetFlags(f *flag.FlagSet) { + f.BoolVar(&c.dumpAll, "all", false, "dump information on all coverage blocks along with their synthetic PCs") +} + +// Execute implements subcommands.Command.Execute. +func (c *Symbolize) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 0 { + f.Usage() + return subcommands.ExitUsageError + } + if !coverage.KcovAvailable() { + return Errorf("symbolize can only be used when coverage is available.") + } + coverage.InitCoverageData() + + if c.dumpAll { + coverage.WriteAllBlocks(os.Stdout) + return subcommands.ExitSuccess + } + + scanner := bufio.NewScanner(os.Stdin) + for scanner.Scan() { + // Input is always base 16, but may or may not have a leading "0x". + str := strings.TrimPrefix(scanner.Text(), "0x") + pc, err := strconv.ParseUint(str, 16 /* base */, 64 /* bitSize */) + if err != nil { + return Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err) + } + if err := coverage.Symbolize(os.Stdout, pc); err != nil { + return Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err) + } + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go index a37d66139..a8c83d662 100644 --- a/runsc/cmd/syscalls.go +++ b/runsc/cmd/syscalls.go @@ -147,7 +147,7 @@ func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, er info := CompatibilityInfo(make(map[string]map[string]ArchInfo)) if osName == osAll { // Special processing for the 'all' OS name. - for osName, _ := range syscallTableMap { + for osName := range syscallTableMap { info[osName] = make(map[string]ArchInfo) // osName is a specific OS name. if err := addToCompatibilityInfo(info, osName, archName); err != nil { @@ -171,7 +171,7 @@ func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, er func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error { if archName == archAll { // Special processing for the 'all' architecture name. 
- for archName, _ := range syscallTableMap[osName] { + for archName := range syscallTableMap[osName] { archInfo, err := getArchInfo(osName, archName) if err != nil { return err diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index c1d6aeae2..5d55422c7 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -72,7 +72,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) id := f.Arg(0) conf := args[0].(*config.Config) - c, err := container.LoadAndCheck(conf.RootDir, id) + c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { Fatalf("loading container: %v", err) } diff --git a/runsc/config/config.go b/runsc/config/config.go index b02d8e2e1..e9fd7708f 100644 --- a/runsc/config/config.go +++ b/runsc/config/config.go @@ -131,7 +131,7 @@ type Config struct { NumNetworkChannels int `flag:"num-network-channels"` // Rootless allows the sandbox to be started with a user that is not root. - // Defense is depth measures are weaker with rootless. Specifically, the + // Defense in depth measures are weaker in rootless mode. Specifically, the // sandbox and Gofer process run as root inside a user namespace with root // mapped to the caller's user. Rootless bool `flag:"rootless"` diff --git a/runsc/config/flags.go b/runsc/config/flags.go index 13d8f1b25..02ab9255a 100644 --- a/runsc/config/flags.go +++ b/runsc/config/flags.go @@ -71,7 +71,7 @@ func RegisterFlags() { flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem") flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") - flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") + flag.Bool("vfs2", false, "enables VFSv2. This uses the new VFS layer that is faster than the previous one.") flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") // Flags that control sandbox runtime behavior: network related. diff --git a/runsc/console/console.go b/runsc/console/console.go index dbb88e117..b36028792 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -24,8 +24,8 @@ import ( "golang.org/x/sys/unix" ) -// NewWithSocket creates pty master/replica pair, sends the master FD over the given -// socket, and returns the replica. +// NewWithSocket creates pty master/replica pair, sends the master FD over the +// given socket, and returns the replica. func NewWithSocket(socketPath string) (*os.File, error) { // Create a new pty master and replica. 
ptyMaster, ptyReplica, err := pty.Open() diff --git a/runsc/container/BUILD b/runsc/container/BUILD index c33755482..8793c8916 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "more_shards") package(licenses = ["notice"]) @@ -24,6 +24,7 @@ go_library( "//runsc/boot", "//runsc/cgroup", "//runsc/config", + "//runsc/console", "//runsc/sandbox", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", @@ -48,7 +49,7 @@ go_test( "//test/cmd/test_app", ], library = ":container", - shard_count = 10, + shard_count = more_shards, tags = [ "requires-kvm", ], diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 4228399b8..1b0fdebd6 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "io" + "math/rand" "os" "path/filepath" "syscall" @@ -27,7 +28,6 @@ import ( "github.com/kr/pty" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/pkg/unet" @@ -38,19 +38,22 @@ import ( // path is under 108 characters (the unix socket path length limit), // relativizing the path if necessary. func socketPath(bundleDir string) (string, error) { - path := filepath.Join(bundleDir, "socket") + num := rand.Intn(10000) + path := filepath.Join(bundleDir, fmt.Sprintf("socket-%04d", num)) + const maxPathLen = 108 + if len(path) <= maxPathLen { + return path, nil + } + + // Path is too large, try to make it smaller. cwd, err := os.Getwd() if err != nil { return "", fmt.Errorf("error getting cwd: %v", err) } - relPath, err := filepath.Rel(cwd, path) + path, err = filepath.Rel(cwd, path) if err != nil { return "", fmt.Errorf("error getting relative path for %q from cwd %q: %v", path, cwd, err) } - if len(path) > len(relPath) { - path = relPath - } - const maxPathLen = 108 if len(path) > maxPathLen { return "", fmt.Errorf("could not get socket path under length limit %d: %s", maxPathLen, path) } @@ -159,6 +162,82 @@ func TestConsoleSocket(t *testing.T) { } } +// Test that a pty FD is sent over the console socket if one is provided. +func TestMultiContainerConsoleSocket(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Set up the containers.
+ sleep := []string{"sleep", "100"} + tru := []string{"true"} + testSpecs, ids := createSpecs(sleep, tru) + testSpecs[1].Process.Terminal = true + + bundleDir, cleanup, err := testutil.SetupBundleDir(testSpecs[0]) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: ids[0], + Spec: testSpecs[0], + BundleDir: bundleDir, + } + rootCont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer rootCont.Destroy() + if err := rootCont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + bundleDir, cleanup, err = testutil.SetupBundleDir(testSpecs[0]) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup := createConsoleSocket(t, sock) + defer cleanup() + + // Create the container and pass the socket name. + args = Args{ + ID: ids[1], + Spec: testSpecs[1], + BundleDir: bundleDir, + ConsoleSocket: sock, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + ptyMaster.Close() + }) + } +} + // Test that job control signals work on a console created with "exec -ti". func TestJobControlSignalExec(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") @@ -221,9 +300,9 @@ func TestJobControlSignalExec(t *testing.T) { // Make sure all the processes are running. expectedPL := []*control.Process{ // Root container process. - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().Cmd("sleep").Process(), // Bash from exec process. - {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}}, + newProcessBuilder().PID(2).Cmd("bash").Process(), } if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) @@ -233,7 +312,7 @@ func TestJobControlSignalExec(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for it to start. Sleep's PPID is bash's PID. - expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}}) + expectedPL = append(expectedPL, newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process()) if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) } @@ -254,7 +333,7 @@ func TestJobControlSignalExec(t *testing.T) { // Sleep is dead, but it may take more time for bash to notice and // change the foreground process back to itself. We know it is done // when bash writes "Terminated" to the pty. - if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { + if err := testutil.WaitUntilRead(ptyMaster, "Terminated", 5*time.Second); err != nil { t.Fatalf("bash did not take over pty: %v", err) } @@ -359,7 +438,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { // Wait for bash to start. 
expectedPL := []*control.Process{ - {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).Cmd("bash").Process(), } if err := waitForProcessList(c, expectedPL); err != nil { t.Fatalf("error waiting for processes: %v", err) } @@ -369,7 +448,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for sleep to start. - expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}}) + expectedPL = append(expectedPL, newProcessBuilder().PID(2).PPID(1).Cmd("sleep").Process()) if err := waitForProcessList(c, expectedPL); err != nil { t.Fatalf("error waiting for processes: %v", err) } @@ -393,7 +472,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { // Sleep is dead, but it may take more time for bash to notice and // change the foreground process back to itself. We know it is done // when bash writes "Terminated" to the pty. - if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil { + if err := testutil.WaitUntilRead(ptyBuf, "Terminated", 5*time.Second); err != nil { t.Fatalf("bash did not take over pty: %v", err) } @@ -414,6 +493,104 @@ func TestJobControlSignalRootContainer(t *testing.T) { } } +// Test that the terminal works with root and sub-containers. +func TestMultiContainerTerminal(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Don't let bash execute from profile or rc files, otherwise our PID + // counts get messed up. + bash := []string{"/bin/bash", "--noprofile", "--norc"} + testSpecs, ids := createSpecs(bash, bash) + + type termContainer struct { + container *Container + master *os.File + } + var containers []termContainer + for i, spec := range testSpecs { + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + spec.Process.Terminal = true + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup := createConsoleSocket(t, sock) + defer cleanup() + + // Create the container and pass the socket name. + args := Args{ + ID: ids[i], + Spec: spec, + BundleDir: bundleDir, + ConsoleSocket: sock, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + defer ptyMaster.Close() + + containers = append(containers, termContainer{ + container: cont, + master: ptyMaster, + }) + } + + for _, tc := range containers { + // Bash output as well as sandbox output will be written to the PTY + // file. Writes after a certain point will block unless we drain the + // PTY, so we must continually copy from it. + // + // We log the output to stderr for debuggability, and also to a buffer, + // since we wait on particular output from bash below. We use a custom + // blockingBuffer which is thread-safe and also blocks on Read calls, + // which makes this a suitable Reader for WaitUntilRead.
+ ptyBuf := newBlockingBuffer() + tee := io.TeeReader(tc.master, ptyBuf) + go io.Copy(os.Stderr, tee) + + // Wait for bash to start. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("bash").Process(), + } + if err := waitForProcessList(tc.container, expectedPL); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + + // Execute echo command and check that it was executed correctly. Use + // a variable to ensure it's not matching against command echo. + tc.master.Write([]byte("echo foo-${PWD}-123\n")) + if err := testutil.WaitUntilRead(ptyBuf, "foo-/-123", 5*time.Second); err != nil { + t.Fatalf("echo didn't execute: %v", err) + } + } + }) + } +} + // blockingBuffer is a thread-safe buffer that blocks when reading if the // buffer is empty. It implements io.ReadWriter. type blockingBuffer struct { diff --git a/runsc/container/container.go b/runsc/container/container.go index 4aa139c88..5a0f8d5dc 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -38,6 +38,7 @@ import ( "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/config" + "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/sandbox" "gvisor.dev/gvisor/runsc/specutils" ) @@ -79,6 +80,7 @@ func validateID(id string) error { // - It calls 'runsc delete'. runc implementation kills --all SIGKILL once // again just to be sure, waits, and then proceeds with remaining teardown. // +// Container is thread-unsafe. type Container struct { // ID is the container ID. ID string `json:"id"` @@ -126,125 +128,6 @@ type Container struct { goferIsChild bool } -// loadSandbox loads all containers that belong to the sandbox with the given -// ID. -func loadSandbox(rootDir, id string) ([]*Container, error) { - cids, err := List(rootDir) - if err != nil { - return nil, err - } - - // Load the container metadata. - var containers []*Container - for _, cid := range cids { - container, err := Load(rootDir, cid) - if err != nil { - // Container file may not exist if it raced with creation/deletion or - // directory was left behind. Load provides a snapshot in time, so it's - // fine to skip it. - if os.IsNotExist(err) { - continue - } - return nil, fmt.Errorf("loading container %q: %v", id, err) - } - if container.Sandbox.ID == id { - containers = append(containers, container) - } - } - return containers, nil -} - -// Load loads a container with the given id from a metadata file. partialID may -// be an abbreviation of the full container id, in which case Load loads the -// container to which id unambiguously refers to. Returns ErrNotExist if -// container doesn't exist. -func Load(rootDir, partialID string) (*Container, error) { - log.Debugf("Load container, rootDir: %q, partial cid: %s", rootDir, partialID) - if err := validateID(partialID); err != nil { - return nil, fmt.Errorf("invalid container id: %v", err) - } - - id, err := findContainerID(rootDir, partialID) - if err != nil { - // Preserve error so that callers can distinguish 'not found' errors. - return nil, err - } - - state := StateFile{ - RootDir: rootDir, - ID: id, - } - defer state.close() - - c := &Container{} - if err := state.load(c); err != nil { - if os.IsNotExist(err) { - // Preserve error so that callers can distinguish 'not found' errors. 
- return nil, err - } - return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err) - } - return c, nil -} - -// LoadAndCheck is similar to Load(), but also checks if the container is still -// running to get an error earlier to the caller. -func LoadAndCheck(rootDir, partialID string) (*Container, error) { - c, err := Load(rootDir, partialID) - if err != nil { - // Preserve error so that callers can distinguish 'not found' errors. - return nil, err - } - - // If the status is "Running" or "Created", check that the sandbox/container - // is still running, setting it to Stopped if not. - // - // This is inherently racy. - switch c.Status { - case Created: - if !c.isSandboxRunning() { - // Sandbox no longer exists, so this container definitely does not exist. - c.changeStatus(Stopped) - } - case Running: - if err := c.SignalContainer(syscall.Signal(0), false); err != nil { - c.changeStatus(Stopped) - } - } - - return c, nil -} - -func findContainerID(rootDir, partialID string) (string, error) { - // Check whether the id fully specifies an existing container. - stateFile := buildStatePath(rootDir, partialID) - if _, err := os.Stat(stateFile); err == nil { - return partialID, nil - } - - // Now see whether id could be an abbreviation of exactly 1 of the - // container ids. If id is ambiguous (it could match more than 1 - // container), it is an error. - ids, err := List(rootDir) - if err != nil { - return "", err - } - rv := "" - for _, id := range ids { - if strings.HasPrefix(id, partialID) { - if rv != "" { - return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id) - } - rv = id - } - } - if rv == "" { - return "", os.ErrNotExist - } - log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv) - return rv, nil -} - // Args is used to configure a new container. type Args struct { // ID is the container unique identifier. @@ -289,6 +172,15 @@ func New(conf *config.Config, args Args) (*Container, error) { return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err) } + sandboxID := args.ID + if !isRoot(args.Spec) { + var ok bool + sandboxID, ok = specutils.SandboxID(args.Spec) + if !ok { + return nil, fmt.Errorf("no sandbox ID found when creating container") + } + } + c := &Container{ ID: args.ID, Spec: args.Spec, @@ -299,7 +191,10 @@ func New(conf *config.Config, args Args) (*Container, error) { Owner: os.Getenv("USER"), Saver: StateFile{ RootDir: conf.RootDir, - ID: args.ID, + ID: FullID{ + SandboxID: sandboxID, + ContainerID: args.ID, + }, }, } // The Cleanup object cleans up partially created containers when an error @@ -314,10 +209,17 @@ func New(conf *config.Config, args Args) (*Container, error) { } defer c.Saver.unlock() - // If the metadata annotations indicate that this container should be - // started in an existing sandbox, we must do so. The metadata will - // indicate the ID of the sandbox, which is the same as the ID of the - // init container in the sandbox. + // If the metadata annotations indicate that this container should be started + // in an existing sandbox, we must do so. These are the possible metadata + // annotation states: + // 1. No annotations: it means that there is a single container and this + // container is obviously the root. Both container and sandbox share the + // ID. + // 2. Container type == sandbox: it means this is the root container + // starting the sandbox. Both container and sandbox share the same ID. + // 3. 
Container type == container: it means this is a subcontainer of an + // already started sandbox. In this case, the container ID is different + // from the sandbox ID. if isRoot(args.Spec) { log.Debugf("Creating new sandbox for container, cid: %s", args.ID) @@ -356,7 +258,7 @@ func New(conf *config.Config, args Args) (*Container, error) { // Start a new sandbox for this container. Any errors after this point // must destroy the container. sandArgs := &sandbox.Args{ - ID: args.ID, + ID: sandboxID, Spec: args.Spec, BundleDir: args.BundleDir, ConsoleSocket: args.ConsoleSocket, @@ -377,27 +279,34 @@ func New(conf *config.Config, args Args) (*Container, error) { return nil, err } } else { - // This is sort of confusing. For a sandbox with a root - // container and a child container in it, runsc sees: - // * A container struct whose sandbox ID is equal to the - // container ID. This is the root container that is tied to - // the creation of the sandbox. - // * A container struct whose sandbox ID is equal to the above - // container/sandbox ID, but that has a different container - // ID. This is the child container. - sbid, ok := specutils.SandboxID(args.Spec) - if !ok { - return nil, fmt.Errorf("no sandbox ID found when creating container") - } - log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sbid) + log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID) // Find the sandbox associated with this ID. - sb, err := LoadAndCheck(conf.RootDir, sbid) + fullID := FullID{ + SandboxID: sandboxID, + ContainerID: sandboxID, + } + sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true}) if err != nil { return nil, err } c.Sandbox = sb.Sandbox - if err := c.Sandbox.CreateContainer(c.ID); err != nil { + + // If the console control socket file is provided, then create a new + // pty master/replica pair and send the TTY to the sandbox process. + var tty *os.File + if c.ConsoleSocket != "" { + // Create a new TTY pair and send the master on the provided socket. + var err error + tty, err = console.NewWithSocket(c.ConsoleSocket) + if err != nil { + return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err) + } + // The tty file is transferred to the sandbox, so it can be closed here. + defer tty.Close() + } + + if err := c.Sandbox.CreateContainer(c.ID, tty); err != nil { return nil, err } } @@ -451,11 +360,16 @@ func (c *Container) Start(conf *config.Config) error { // the start (and all their child processes). if err := runInCgroup(c.Sandbox.Cgroup, func() error { // Create the gofer process. - ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) + goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) if err != nil { return err } - defer mountsFile.Close() + defer func() { + _ = mountsFile.Close() + for _, f := range goferFiles { + _ = f.Close() + } + }() cleanMounts, err := specutils.ReadMounts(mountsFile) if err != nil { @@ -463,7 +377,14 @@ func (c *Container) Start(conf *config.Config) error { } c.Spec.Mounts = cleanMounts - return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles) + // Set up stdio if the container is not using a terminal. Otherwise, the + // TTY was already set up in create.
+ var stdios []*os.File + if !c.Spec.Process.Terminal { + stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr} + } + + return c.Sandbox.StartContainer(c.Spec, conf, c.ID, stdios, goferFiles) }); err != nil { return err } @@ -599,7 +520,7 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // returns its WaitStatus. func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID) - if !c.isSandboxRunning() { + if !c.IsSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } return c.Sandbox.WaitPID(c.Sandbox.ID, pid) @@ -609,7 +530,7 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { // its WaitStatus. func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID) - if !c.isSandboxRunning() { + if !c.IsSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } return c.Sandbox.WaitPID(c.ID, pid) @@ -629,7 +550,7 @@ func (c *Container) SignalContainer(sig syscall.Signal, all bool) error { if err := c.requireStatus("signal", Running, Stopped); err != nil { return err } - if !c.isSandboxRunning() { + if !c.IsSandboxRunning() { return fmt.Errorf("sandbox is not running") } return c.Sandbox.SignalContainer(c.ID, sig, all) @@ -641,7 +562,7 @@ func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error { if err := c.requireStatus("signal a process inside", Running); err != nil { return err } - if !c.isSandboxRunning() { + if !c.IsSandboxRunning() { return fmt.Errorf("sandbox is not running") } return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false) @@ -860,7 +781,7 @@ func (c *Container) waitForStopped() error { defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { - if c.isSandboxRunning() { + if c.IsSandboxRunning() { if err := c.SignalContainer(syscall.Signal(0), false); err == nil { return fmt.Errorf("container is still running") } @@ -1062,7 +983,8 @@ func (c *Container) changeStatus(s Status) { c.Status = s } -func (c *Container) isSandboxRunning() bool { +// IsSandboxRunning returns true if the sandbox exists and is running. +func (c *Container) IsSandboxRunning() bool { return c.Sandbox != nil && c.Sandbox.IsRunning() } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index fa99e403a..a92ae046d 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -364,7 +364,7 @@ func TestLifecycle(t *testing.T) { defer c.Destroy() // Load the container from disk and check the status. - c, err = LoadAndCheck(rootDir, args.ID) + c, err = Load(rootDir, FullID{ContainerID: args.ID}, LoadOpts{}) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -377,7 +377,11 @@ func TestLifecycle(t *testing.T) { if err != nil { t.Fatalf("error listing containers: %v", err) } - if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) { + fullID := FullID{ + SandboxID: args.ID, + ContainerID: args.ID, + } + if got, want := ids, []FullID{fullID}; !reflect.DeepEqual(got, want) { t.Errorf("container list got %v, want %v", got, want) } @@ -387,7 +391,7 @@ func TestLifecycle(t *testing.T) { } // Load the container from disk and check the status. 
- c, err = LoadAndCheck(rootDir, args.ID) + c, err = Load(rootDir, fullID, LoadOpts{Exact: true}) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -428,7 +432,7 @@ func TestLifecycle(t *testing.T) { } // Load the container from disk and check the status. - c, err = LoadAndCheck(rootDir, args.ID) + c, err = Load(rootDir, fullID, LoadOpts{Exact: true}) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -451,7 +455,7 @@ func TestLifecycle(t *testing.T) { } // Loading the container by id should fail. - if _, err = LoadAndCheck(rootDir, args.ID); err == nil { + if _, err = Load(rootDir, fullID, LoadOpts{Exact: true}); err == nil { t.Errorf("expected loading destroyed container to fail, but it did not") } }) @@ -1738,7 +1742,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) { cids[2]: cids[2], } for shortid, longid := range unambiguous { - if _, err := LoadAndCheck(rootDir, shortid); err != nil { + if _, err := Load(rootDir, FullID{ContainerID: shortid}, LoadOpts{}); err != nil { t.Errorf("%q should resolve to %q: %v", shortid, longid, err) } } @@ -1749,7 +1753,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) { "ba", } for _, shortid := range ambiguous { - if s, err := LoadAndCheck(rootDir, shortid); err == nil { + if s, err := Load(rootDir, FullID{ContainerID: shortid}, LoadOpts{}); err == nil { t.Errorf("%q should be ambiguous, but resolved to %q", shortid, s.ID) } } @@ -2007,7 +2011,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) { // Container is not thread safe, so load another instance to run // concurrently. - startCont, err := LoadAndCheck(rootDir, args.ID) + startCont, err := Load(rootDir, FullID{ContainerID: args.ID}, LoadOpts{}) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -2332,6 +2336,42 @@ func TestTTYField(t *testing.T) { } } +// Test that a container can run even when there are corrupt state files in the +// root directory. +func TestCreateWithCorruptedStateFile(t *testing.T) { + conf := testutil.TestConfig(t) + spec := testutil.NewSpecWithArgs("/bin/true") + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create corrupted state file. + corruptID := testutil.RandomContainerID() + corruptState := buildPath(conf.RootDir, FullID{SandboxID: corruptID, ContainerID: corruptID}, stateFileExtension) + if err := ioutil.WriteFile(corruptState, []byte("this{file(is;not[valid.json"), 0777); err != nil { + t.Fatalf("ioutil.WriteFile() failed: %v", err) + } + defer os.Remove(corruptState) + + if _, err := Load(conf.RootDir, FullID{ContainerID: corruptID}, LoadOpts{SkipCheck: true}); err == nil { + t.Fatalf("loading corrupted state file should have failed") + } + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + Attached: true, + } + if ws, err := Run(conf, args); err != nil { + t.Errorf("running container: %v", err) + } else if !ws.Exited() || ws.ExitStatus() != 0 { + t.Errorf("container failed, waitStatus: %v", ws) + } +} + func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) { args := &control.ExecArgs{ Filename: name, diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index cadc63bf3..044eec6fe 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -301,54 +301,21 @@ func TestMultiContainerWait(t *testing.T) { } defer cleanup() - // Check via ps that multiple processes are running. - expectedPL := []*control.Process{ - newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(), + // Check that we can wait for the sub-container. + c := containers[1] + if ws, err := c.Wait(); err != nil { + t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) } - if err := waitForProcessList(containers[1], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) + if _, err := c.Wait(); err != nil { + t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err) } - // Wait on the short lived container from multiple goroutines. - wg := sync.WaitGroup{} - for i := 0; i < 3; i++ { - wg.Add(1) - go func(c *Container) { - defer wg.Done() - if ws, err := c.Wait(); err != nil { - t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) - } else if es := ws.ExitStatus(); es != 0 { - t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) - } - if _, err := c.Wait(); err != nil { - t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err) - } - }(containers[1]) - } - - // Also wait via PID. - for i := 0; i < 3; i++ { - wg.Add(1) - go func(c *Container) { - defer wg.Done() - const pid = 2 - if ws, err := c.WaitPID(pid); err != nil { - t.Errorf("failed to wait for PID %d: %v", pid, err) - } else if es := ws.ExitStatus(); es != 0 { - t.Errorf("PID %d exited with non-zero status %d", pid, es) - } - if _, err := c.WaitPID(pid); err == nil { - t.Errorf("wait for stopped PID %d should fail", pid) - } - }(containers[1]) - } - - wg.Wait() - // After Wait returns, ensure that the root container is running and // the child has finished.
- expectedPL = []*control.Process{ - newProcessBuilder().Cmd("sleep").Process(), + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sleep").PID(1).Process(), } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) @@ -763,7 +730,7 @@ func TestMultiContainerKillAll(t *testing.T) { // processes still running inside. containers[1].SignalContainer(syscall.SIGKILL, false) op := func() error { - c, err := LoadAndCheck(conf.RootDir, ids[1]) + c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{}) if err != nil { return err } @@ -777,7 +744,7 @@ func TestMultiContainerKillAll(t *testing.T) { } } - c, err := LoadAndCheck(conf.RootDir, ids[1]) + c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{}) if err != nil { t.Fatalf("failed to load child container %q: %v", c.ID, err) } @@ -900,7 +867,7 @@ func TestMultiContainerDestroyStarting(t *testing.T) { // Container is not thread safe, so load another instance to run // concurrently. - startCont, err := LoadAndCheck(rootDir, ids[i]) + startCont, err := Load(rootDir, FullID{ContainerID: ids[i]}, LoadOpts{}) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -1836,3 +1803,91 @@ func TestMultiContainerEvent(t *testing.T) { } } } + +// Tests that duplicate environment variables in the spec are merged into a +// single one. +func TestDuplicateEnvVariable(t *testing.T) { + conf := testutil.TestConfig(t) + + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Create files to dump `env` output. + files := [3]*os.File{} + for i := 0; i < len(files); i++ { + var err error + files[i], err = ioutil.TempFile(testutil.TmpDir(), "env-var-test") + if err != nil { + t.Fatalf("creating temp file: %v", err) + } + defer files[i].Close() + defer os.Remove(files[i].Name()) + } + + // Set up the containers. Use the root container to test exec too. + cmd1 := fmt.Sprintf("env > %q; sleep 1000", files[0].Name()) + cmd2 := fmt.Sprintf("env > %q", files[1].Name()) + cmdExec := fmt.Sprintf("env > %q", files[2].Name()) + testSpecs, ids := createSpecs([]string{"/bin/bash", "-c", cmd1}, []string{"/bin/bash", "-c", cmd2}) + testSpecs[0].Process.Env = append(testSpecs[0].Process.Env, "VAR=foo", "VAR=bar") + testSpecs[1].Process.Env = append(testSpecs[1].Process.Env, "VAR=foo", "VAR=bar") + + containers, cleanup, err := startContainers(conf, testSpecs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Wait for the `env` from the root container to finish. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("bash").Process(), + newProcessBuilder().Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + if ws, err := containers[1].Wait(); err != nil { + t.Errorf("failed to wait for container 1: %v", err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("container %s exited with non-zero status: %v", containers[1].ID, es) + } + + execArgs := &control.ExecArgs{ + Filename: "/bin/bash", + Argv: []string{"/bin/bash", "-c", cmdExec}, + Envv: []string{"VAR=foo", "VAR=bar"}, + } + if ws, err := containers[0].executeSync(execArgs); err != nil || ws.ExitStatus() != 0 { + t.Fatalf("exec failed, ws: %v, err: %v", ws, err) + } + + // Now read and check that none of the env dumps contains repeated values. + for _, file := range files { + out, err := ioutil.ReadAll(file) + if err != nil { + t.Fatal(err) + } + t.Logf("Checking env %q:\n%s", file.Name(), out) + envs := make(map[string]string) + for _, line := range strings.Split(string(out), "\n") { + if len(line) == 0 { + continue + } + envVar := strings.SplitN(line, "=", 2) + if len(envVar) != 2 { + t.Fatalf("invalid env variable: %s", line) + } + key := envVar[0] + if val, ok := envs[key]; ok { + t.Errorf("env variable %q is duplicated: %q and %q", key, val, envVar[1]) + } + envs[key] = envVar[1] + } + if _, ok := envs["VAR"]; !ok { + t.Errorf("variable VAR missing: %v", envs) + } + } +} diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go index 17a251530..dfbf1f2d3 100644 --- a/runsc/container/state_file.go +++ b/runsc/container/state_file.go @@ -20,58 +20,228 @@ import ( "io/ioutil" "os" "path/filepath" + "regexp" + "strings" + "syscall" "github.com/gofrs/flock" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" ) -const stateFileExtension = ".state" +const stateFileExtension = "state" -// StateFile handles load from/save to container state safely from multiple -// processes. It uses a lock file to provide synchronization between operations. +// LoadOpts provides options for Load()ing a container. +type LoadOpts struct { + // Exact tells whether the search should be exact. See Load() for more. + Exact bool + + // SkipCheck tells Load() to skip checking if the container is running. + SkipCheck bool +} + +// Load loads a container with the given id from a metadata file. "id" may +// be an abbreviation of the full container id when LoadOpts.Exact is not +// set. It also checks if the container is still running, in order to return +// an error to the caller earlier. This check is skipped if LoadOpts.SkipCheck +// is set. // -// The lock file is located at: "${s.RootDir}/${s.ID}.lock". -// The state file is located at: "${s.RootDir}/${s.ID}.state". -type StateFile struct { - // RootDir is the directory containing the container metadata file. - RootDir string `json:"rootDir"` +// Returns ErrNotExist if no container is found. Returns an error if more than +// one container matches the ID prefix. +func Load(rootDir string, id FullID, opts LoadOpts) (*Container, error) { + log.Debugf("Load container, rootDir: %q, id: %v, opts: %+v", rootDir, id, opts) + if !opts.Exact { + var err error + id, err = findContainerID(rootDir, id.ContainerID) + if err != nil { + // Preserve error so that callers can distinguish 'not found' errors. + return nil, err + } + } - // ID is the container ID.
- ID string `json:"id"` + if err := id.validate(); err != nil { + return nil, fmt.Errorf("invalid container id: %v", err) + } + state := StateFile{ + RootDir: rootDir, + ID: id, + } + defer state.close() - // - // Fields below this line are not saved in the state file and will not - // be preserved across commands. - // + c := &Container{} + if err := state.load(c); err != nil { + if os.IsNotExist(err) { + // Preserve error so that callers can distinguish 'not found' errors. + return nil, err + } + return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err) + } - once sync.Once - flock *flock.Flock + if !opts.SkipCheck { + // If the status is "Running" or "Created", check that the sandbox/container + // is still running, setting it to Stopped if not. + // + // This is inherently racy. + switch c.Status { + case Created: + if !c.IsSandboxRunning() { + // Sandbox no longer exists, so this container definitely does not exist. + c.changeStatus(Stopped) + } + case Running: + if err := c.SignalContainer(syscall.Signal(0), false); err != nil { + c.changeStatus(Stopped) + } + } + } + + return c, nil } // List returns all container ids in the given root directory. -func List(rootDir string) ([]string, error) { +func List(rootDir string) ([]FullID, error) { log.Debugf("List containers %q", rootDir) - list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension)) + return listMatch(rootDir, FullID{}) +} + +// listMatch returns all container ids that match the provided id. +func listMatch(rootDir string, id FullID) ([]FullID, error) { + id.SandboxID += "*" + id.ContainerID += "*" + pattern := buildPath(rootDir, id, stateFileExtension) + list, err := filepath.Glob(pattern) if err != nil { return nil, err } - var out []string + var out []FullID for _, path := range list { - // Filter out files that do no belong to a container. - fileName := filepath.Base(path) - if len(fileName) < len(stateFileExtension) { - panic(fmt.Sprintf("invalid file match %q", path)) - } - // Remove the extension. - cid := fileName[:len(fileName)-len(stateFileExtension)] - if validateID(cid) == nil { - out = append(out, cid) + id, err := parseFileName(filepath.Base(path)) + if err == nil { + out = append(out, id) } } return out, nil } +// loadSandbox loads all containers that belong to the sandbox with the given +// ID. +func loadSandbox(rootDir, id string) ([]*Container, error) { + cids, err := listMatch(rootDir, FullID{SandboxID: id}) + if err != nil { + return nil, err + } + + // Load the container metadata. + var containers []*Container + for _, cid := range cids { + container, err := Load(rootDir, cid, LoadOpts{Exact: true, SkipCheck: true}) + if err != nil { + // Container file may not exist if it raced with creation/deletion or + // directory was left behind. Load provides a snapshot in time, so it's + // fine to skip it. + if os.IsNotExist(err) { + continue + } + return nil, fmt.Errorf("loading sandbox %q, failed to load container %q: %v", id, cid, err) + } + containers = append(containers, container) + } + return containers, nil +} + +func findContainerID(rootDir, partialID string) (FullID, error) { + // Check whether the id fully specifies an existing container. 
+ pattern := buildPath(rootDir, FullID{SandboxID: "*", ContainerID: partialID + "*"}, stateFileExtension) + list, err := filepath.Glob(pattern) + if err != nil { + return FullID{}, err + } + switch len(list) { + case 0: + return FullID{}, os.ErrNotExist + case 1: + return parseFileName(filepath.Base(list[0])) + } + + // Now see whether id could be an abbreviation of exactly 1 of the + // container ids. If id is ambiguous (it could match more than 1 + // container), it is an error. + ids, err := List(rootDir) + if err != nil { + return FullID{}, err + } + var rv *FullID + for i := range ids { + // Take the address of the slice element, not of the loop variable, + // so that rv still points at the match after the loop advances. + id := &ids[i] + if strings.HasPrefix(id.ContainerID, partialID) { + if rv != nil { + return FullID{}, fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id) + } + rv = id + } + } + if rv == nil { + return FullID{}, os.ErrNotExist + } + log.Debugf("abbreviated id %q resolves to full id %v", partialID, *rv) + return *rv, nil +} + +func parseFileName(name string) (FullID, error) { + re := regexp.MustCompile(`([\w+\-.]+)_sandbox:([\w+\-.]+)\.` + stateFileExtension) + groups := re.FindStringSubmatch(name) + if len(groups) != 3 { + return FullID{}, fmt.Errorf("invalid state file name format: %q", name) + } + id := FullID{ + SandboxID: groups[2], + ContainerID: groups[1], + } + if err := id.validate(); err != nil { + return FullID{}, fmt.Errorf("invalid state file name %q: %w", name, err) + } + return id, nil +} + +// FullID combines sandbox and container ID to identify a container. Sandbox ID +// is used to allow all containers for a given sandbox to be loaded by matching +// sandbox ID in the file name. +type FullID struct { + SandboxID string `json:"sandboxId"` + ContainerID string `json:"containerId"` +} + +func (f FullID) String() string { + return f.SandboxID + "/" + f.ContainerID +} + +func (f *FullID) validate() error { + if err := validateID(f.SandboxID); err != nil { + return err + } + return validateID(f.ContainerID) +} + +// StateFile handles load from/save to container state safely from multiple +// processes. It uses a lock file to provide synchronization between operations. +// +// The lock file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.lock". +// The state file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.state". +type StateFile struct { + // RootDir is the directory containing the container metadata file. + RootDir string `json:"rootDir"` + + // ID is the sandbox+container ID. + ID FullID `json:"id"` + + // + // Fields below this line are not saved in the state file and will not + // be preserved across commands. + // + + once sync.Once + flock *flock.Flock +} + // lock globally locks all locking operations for the container. func (s *StateFile) lock() error { s.once.Do(func() { @@ -157,18 +327,20 @@ func (s *StateFile) close() error { return s.flock.Close() } -func buildStatePath(rootDir, id string) string { - return filepath.Join(rootDir, id+stateFileExtension) +func buildPath(rootDir string, id FullID, extension string) string { + // Note: "_" and ":" are not valid in IDs. + name := fmt.Sprintf("%s_sandbox:%s.%s", id.ContainerID, id.SandboxID, extension) + return filepath.Join(rootDir, name) } // statePath is the full path to the state file. func (s *StateFile) statePath() string { - return buildStatePath(s.RootDir, s.ID) + return buildPath(s.RootDir, s.ID, stateFileExtension) } // lockPath is the full path to the lock file.
func (s *StateFile) lockPath() string { - return filepath.Join(s.RootDir, s.ID+".lock") + return buildPath(s.RootDir, s.ID, "lock") } // destroy deletes all state created by the stateFile. It may be called with the diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go index 775325c06..f921a8107 100644 --- a/runsc/flag/flag.go +++ b/runsc/flag/flag.go @@ -19,8 +19,10 @@ import ( "flag" ) +// FlagSet is an alias for flag.FlagSet. type FlagSet = flag.FlagSet +// Aliases for flag functions. var ( Bool = flag.Bool CommandLine = flag.CommandLine @@ -32,6 +34,7 @@ var ( Var = flag.Var ) +// ContinueOnError is an alias for flag.ContinueOnError. const ContinueOnError = flag.ContinueOnError // Get returns the flag's underlying object. diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 96c57a426..c56e1d4d0 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -29,9 +29,12 @@ go_test( srcs = ["fsgofer_test.go"], library = ":fsgofer", deps = [ + "//pkg/fd", "//pkg/log", "//pkg/p9", "//pkg/test/testutil", + "//runsc/specutils", + "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 0b628c8ce..c3bba0973 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -49,6 +49,21 @@ const ( allowedOpenFlags = unix.O_TRUNC ) +var ( + // Remember the process uid/gid to skip chown calls when file owner/group + // doesn't need to be changed. + processUID = p9.UID(os.Getuid()) + processGID = p9.GID(os.Getgid()) +) + +// join is equivalent to path.Join() but skips path.Clean() which is expensive. +func join(parent, child string) string { + if child == "." || child == ".." { + panic(fmt.Sprintf("invalid child path %q", child)) + } + return parent + "/" + child +} + // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. @@ -115,7 +130,7 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) } - lf, err := newLocalFile(a, f, a.prefix, readable, stat) + lf, err := newLocalFile(a, f, a.prefix, readable, &stat) if err != nil { return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err) } @@ -124,7 +139,7 @@ func (a *attachPoint) Attach() (p9.File, error) { } // makeQID returns a unique QID for the given stat buffer. 
-func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID { +func (a *attachPoint) makeQID(stat *unix.Stat_t) p9.QID { a.deviceMu.Lock() defer a.deviceMu.Unlock() @@ -245,7 +260,7 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { } func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) { - pathDebug := path.Join(parent.hostPath, name) + pathDebug := join(parent.hostPath, name) f, readable, err := openAnyFile(pathDebug, func(mode int) (*fd.FD, error) { return fd.OpenAt(parent.file, name, openFlags|mode, 0) }) @@ -297,8 +312,8 @@ func openAnyFile(pathDebug string, fn func(mode int) (*fd.FD, error)) (*fd.FD, b return nil, false, extractErrno(err) } -func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error { - switch stat.Mode & unix.S_IFMT { +func checkSupportedFileType(mode uint32, permitSocket bool) error { + switch mode & unix.S_IFMT { case unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK: return nil @@ -313,8 +328,8 @@ func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error { } } -func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat unix.Stat_t) (*localFile, error) { - if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil { +func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat *unix.Stat_t) (*localFile, error) { + if err := checkSupportedFileType(stat.Mode, a.conf.HostUDS); err != nil { return nil, err } @@ -442,8 +457,10 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode, }) defer cu.Clean() - if err := fchown(child.FD(), uid, gid); err != nil { - return nil, nil, p9.QID{}, 0, extractErrno(err) + if uid != processUID || gid != processGID { + if err := fchown(child.FD(), uid, gid); err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } } stat, err := fstat(child.FD()) if err != nil { @@ -452,11 +469,11 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode, c := &localFile{ attachPoint: l.attachPoint, - hostPath: path.Join(l.hostPath, name), + hostPath: join(l.hostPath, name), file: child, mode: mode, fileType: unix.S_IFREG, - qid: l.attachPoint.makeQID(stat), + qid: l.attachPoint.makeQID(&stat), } cu.Release() @@ -488,8 +505,10 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) } defer f.Close() - if err := fchown(f.FD(), uid, gid); err != nil { - return p9.QID{}, extractErrno(err) + if uid != processUID || gid != processGID { + if err := fchown(f.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } } stat, err := fstat(f.FD()) if err != nil { @@ -497,7 +516,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) } cu.Release() - return l.attachPoint.makeQID(stat), nil + return l.attachPoint.makeQID(&stat), nil } // Walk implements p9.File. 
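The `uid != processUID || gid != processGID` guard that now wraps the fchown calls in Create and Mkdir above (and again in Symlink and Mknod below) is the heart of this change: a file created by the gofer already belongs to the gofer's uid/gid, so the chown syscall is only needed when the 9P client requested a different owner. A minimal standalone sketch of the same idea (the helper name and demo are illustrative, not part of the patch):

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

var (
	processUID = os.Getuid()
	processGID = os.Getgid()
)

// chownIfNeeded issues fchown only when the requested owner differs from the
// current process, mirroring the guard added in the patch.
func chownIfNeeded(fd, uid, gid int) error {
	if uid == processUID && gid == processGID {
		// The file was created by this process, so it already has the
		// right owner; skip the syscall entirely.
		return nil
	}
	return unix.Fchown(fd, uid, gid)
}

func main() {
	f, err := os.CreateTemp("", "chown-demo")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	// No-op in the common case: the temp file is already owned by us.
	fmt.Println(chownIfNeeded(int(f.Fd()), processUID, processGID))
}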
@@ -512,7 +531,7 @@ func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, if err != nil { return nil, nil, p9.AttrMask{}, p9.Attr{}, err } - mask, attr := l.fillAttr(stat) + mask, attr := l.fillAttr(&stat) return qids, file, mask, attr, nil } @@ -538,13 +557,13 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error) file: newFile, mode: invalidMode, fileType: l.fileType, - qid: l.attachPoint.makeQID(stat), + qid: l.attachPoint.makeQID(&stat), controlReadable: readable, } return []p9.QID{c.qid}, c, stat, nil } - var qids []p9.QID + qids := make([]p9.QID, 0, len(names)) var lastStat unix.Stat_t last := l for _, name := range names { @@ -560,7 +579,7 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error) _ = f.Close() return nil, nil, unix.Stat_t{}, extractErrno(err) } - c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat) + c, err := newLocalFile(last.attachPoint, f, path, readable, &lastStat) if err != nil { _ = f.Close() return nil, nil, unix.Stat_t{}, extractErrno(err) } @@ -609,11 +628,11 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } - mask, attr := l.fillAttr(stat) + mask, attr := l.fillAttr(&stat) return l.qid, mask, attr, nil } -func (l *localFile) fillAttr(stat unix.Stat_t) (p9.AttrMask, p9.Attr) { +func (l *localFile) fillAttr(stat *unix.Stat_t) (p9.AttrMask, p9.Attr) { attr := p9.Attr{ Mode: p9.FileMode(stat.Mode), UID: p9.UID(stat.Uid), @@ -739,15 +758,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // utimensat operates differently than other syscalls. To operate on a // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty // name. - parent, err := unix.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) - if err != nil { - return extractErrno(err) + parent, oErr := unix.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) + if oErr != nil { + return extractErrno(oErr) } defer unix.Close(parent) - if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil { - log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) - err = extractErrno(terr) + if tErr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); tErr != nil { + log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, tErr) + err = extractErrno(tErr) } } else { // Directories and regular files can operate directly on the fd @@ -768,9 +787,9 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { if valid.GID { gid = int(attr.GID) } - if oerr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil { - log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr) - err = extractErrno(oerr) + if oErr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oErr != nil { + log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oErr) + err = extractErrno(oErr) } } @@ -881,8 +900,10 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
} defer f.Close() - if err := fchown(f.FD(), uid, gid); err != nil { - return p9.QID{}, extractErrno(err) + if uid != processUID || gid != processGID { + if err := fchown(f.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } } stat, err := fstat(f.FD()) if err != nil { @@ -890,7 +911,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. } cu.Release() - return l.attachPoint.makeQID(stat), nil + return l.attachPoint.makeQID(&stat), nil } // Link implements p9.File. @@ -938,8 +959,10 @@ func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid } defer child.Close() - if err := fchown(child.FD(), uid, gid); err != nil { - return p9.QID{}, extractErrno(err) + if uid != processUID || gid != processGID { + if err := fchown(child.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } } stat, err := fstat(child.FD()) if err != nil { @@ -947,7 +970,7 @@ func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid } cu.Release() - return l.attachPoint.makeQID(stat), nil + return l.attachPoint.makeQID(&stat), nil } // UnlinkAt implements p9.File. @@ -1045,7 +1068,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) log.Warningf("Readdir is skipping file with failed stat %q, err: %v", l.hostPath, err) continue } - qid := l.attachPoint.makeQID(stat) + qid := l.attachPoint.makeQID(&stat) offset++ dirents = append(dirents, p9.Dirent{ QID: qid, @@ -1139,7 +1162,7 @@ func (l *localFile) isOpen() bool { // Renamed implements p9.Renamed. func (l *localFile) Renamed(newDir p9.File, newName string) { - l.hostPath = path.Join(newDir.(*localFile).hostPath, newName) + l.hostPath = join(newDir.(*localFile).hostPath, newName) } // extractErrno tries to determine the errno. 
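The Renamed hook above is one of several callers switched from path.Join to the join helper introduced earlier in this file. Since the gofer only ever appends a single, already-validated component, the path.Clean pass inside path.Join is pure overhead on these hot paths. A rough standalone sketch of how the difference could be measured (illustrative, not part of the patch):

package main

import (
	"fmt"
	"path"
	"testing"
)

// join mirrors the fsgofer helper: it assumes child is a single validated
// component, so plain concatenation is safe and no cleaning is needed.
func join(parent, child string) string {
	if child == "." || child == ".." {
		panic(fmt.Sprintf("invalid child path %q", child))
	}
	return parent + "/" + child
}

func main() {
	concat := testing.Benchmark(func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			_ = join("/var/run/sandbox", "file")
		}
	})
	clean := testing.Benchmark(func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			_ = path.Join("/var/run/sandbox", "file")
		}
	})
	fmt.Printf("join:      %s\npath.Join: %s\n", concat, clean)
}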
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index a84206686..c5daebe5e 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -23,10 +23,13 @@ import ( "path/filepath" "testing" + "github.com/syndtr/gocapability/capability" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/runsc/specutils" ) var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} @@ -197,10 +200,13 @@ func setup(fileType uint32) (string, string, error) { switch fileType { case unix.S_IFREG: name = "file" - _, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + fd, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err) } + if fd != nil { + fd.Close() + } defer f.Close() case unix.S_IFDIR: name = "dir" @@ -556,7 +562,28 @@ func TestROMountChecks(t *testing.T) { func TestWalkNotFound(t *testing.T) { runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) { if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT { - t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: unix.ENOENT", s, "nobody-here", err) + t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "nobody-here", err) + } + if _, _, err := s.file.Walk([]string{"nobody", "here"}); err != unix.ENOENT { + t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "nobody/here", err) + } + if !s.conf.ROMount { + if _, err := s.file.Mkdir("dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("MkDir(dir) failed, err: %v", err) + } + if _, _, err := s.file.Walk([]string{"dir", "nobody-here"}); err != unix.ENOENT { + t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "dir/nobody-here", err) + } + } + }) +} + +func TestWalkPanic(t *testing.T) { + runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) { + for _, name := range []string{".", ".."} { + assertPanic(t, func() { + s.file.Walk([]string{name}) + }) } }) } @@ -574,6 +601,27 @@ func TestWalkDup(t *testing.T) { }) } +func TestWalkMultiple(t *testing.T) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + var names []string + var parent p9.File = s.file + for i := 0; i < 5; i++ { + name := fmt.Sprintf("dir%d", i) + names = append(names, name) + + if _, err := parent.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("MkDir(%q) failed, err: %v", name, err) + } + + var err error + _, parent, err = s.file.Walk(names) + if err != nil { + t.Errorf("Walk(%q): %v", name, err) + } + } + }) +} + func TestReaddir(t *testing.T) { runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { name := "dir" @@ -819,3 +867,168 @@ func TestMknod(t *testing.T) { } }) } + +func BenchmarkWalkOne(b *testing.B) { + path, name, err := setup(unix.S_IFDIR) + if err != nil { + b.Fatalf("%v", err) + } + defer os.RemoveAll(path) + + a, err := NewAttachPoint(path, Config{}) + if err != nil { + b.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + b.Fatalf("Attach failed, err: %v", err) + } + defer root.Close() + + names := []string{name} + files := make([]p9.File, 0, 1000) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, 
file, err := root.Walk(names) + if err != nil { + b.Fatalf("Walk(%q): %v", name, err) + } + files = append(files, file) + + // Avoid running out of FDs. + if len(files) == cap(files) { + b.StopTimer() + for _, file := range files { + file.Close() + } + files = files[:0] + b.StartTimer() + } + } + + b.StopTimer() + for _, file := range files { + file.Close() + } +} + +func BenchmarkCreate(b *testing.B) { + path, _, err := setup(unix.S_IFDIR) + if err != nil { + b.Fatalf("%v", err) + } + defer os.RemoveAll(path) + + a, err := NewAttachPoint(path, Config{}) + if err != nil { + b.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + b.Fatalf("Attach failed, err: %v", err) + } + defer root.Close() + + files := make([]p9.File, 0, 500) + fds := make([]*fd.FD, 0, 500) + uid := p9.UID(os.Getuid()) + gid := p9.GID(os.Getgid()) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + name := fmt.Sprintf("same-%d", i) + fd, file, _, _, err := root.Create(name, p9.ReadOnly, 0777, uid, gid) + if err != nil { + b.Fatalf("Create(%q): %v", name, err) + } + files = append(files, file) + if fd != nil { + fds = append(fds, fd) + } + + // Avoid running out of FDs. + if len(files) == cap(files) { + b.StopTimer() + for _, file := range files { + file.Close() + } + files = files[:0] + for _, fd := range fds { + fd.Close() + } + fds = fds[:0] + b.StartTimer() + } + } + + b.StopTimer() + for _, file := range files { + file.Close() + } + for _, fd := range fds { + fd.Close() + } +} + +func BenchmarkCreateDiffOwner(b *testing.B) { + if !specutils.HasCapabilities(capability.CAP_CHOWN) { + b.Skipf("Test requires CAP_CHOWN") + } + + path, _, err := setup(unix.S_IFDIR) + if err != nil { + b.Fatalf("%v", err) + } + defer os.RemoveAll(path) + + a, err := NewAttachPoint(path, Config{}) + if err != nil { + b.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + b.Fatalf("Attach failed, err: %v", err) + } + defer root.Close() + + files := make([]p9.File, 0, 500) + fds := make([]*fd.FD, 0, 500) + gid := p9.GID(os.Getgid()) + const nobody = 65534 + + b.ResetTimer() + for i := 0; i < b.N; i++ { + name := fmt.Sprintf("diff-%d", i) + fd, file, _, _, err := root.Create(name, p9.ReadOnly, 0777, nobody, gid) + if err != nil { + b.Fatalf("Create(%q): %v", name, err) + } + files = append(files, file) + if fd != nil { + fds = append(fds, fd) + } + + // Avoid running out of FDs. + if len(files) == cap(files) { + b.StopTimer() + for _, file := range files { + file.Close() + } + files = files[:0] + for _, fd := range fds { + fd.Close() + } + fds = fds[:0] + b.StartTimer() + } + } + + b.StopTimer() + for _, file := range files { + file.Close() + } + for _, fd := range fds { + fd.Close() + } +} diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 8f66dd1f8..9e429f7d5 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -127,7 +127,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG // Get all interfaces in the namespace. 
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 8f66dd1f8..9e429f7d5 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -127,7 +127,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 	// Get all interfaces in the namespace.
 	ifaces, err := net.Interfaces()
 	if err != nil {
-		return fmt.Errorf("querying interfaces: %v", err)
+		return fmt.Errorf("querying interfaces: %w", err)
 	}
 
 	isRoot, err := isRootNS()
@@ -148,14 +148,14 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 
 		allAddrs, err := iface.Addrs()
 		if err != nil {
-			return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err)
+			return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
 		}
 
 		// We build our own loopback device.
 		if iface.Flags&net.FlagLoopback != 0 {
 			link, err := loopbackLink(iface, allAddrs)
 			if err != nil {
-				return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err)
+				return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err)
 			}
 			args.LoopbackLinks = append(args.LoopbackLinks, link)
 			continue
@@ -209,7 +209,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 		// Get the link for the interface.
 		ifaceLink, err := netlink.LinkByName(iface.Name)
 		if err != nil {
-			return fmt.Errorf("getting link for interface %q: %v", iface.Name, err)
+			return fmt.Errorf("getting link for interface %q: %w", iface.Name, err)
 		}
 		link.LinkAddress = ifaceLink.Attrs().HardwareAddr
 
@@ -219,7 +219,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 			log.Debugf("Creating Channel %d", i)
 			socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO)
 			if err != nil {
-				return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
+				return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err)
 			}
 			if i == 0 {
 				link.GSOMaxSize = socketEntry.gsoMaxSize
@@ -241,11 +241,12 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 		// Collect the addresses for the interface, enable forwarding,
 		// and remove them from the host.
 		for _, addr := range ipAddrs {
-			link.Addresses = append(link.Addresses, addr.IP)
+			prefix, _ := addr.Mask.Size()
+			link.Addresses = append(link.Addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix})
 
 			// Steal IP address from NIC.
 			if err := removeAddress(ifaceLink, addr.String()); err != nil {
-				return fmt.Errorf("removing address %v from device %q: %v", iface.Name, addr, err)
+				return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err)
 			}
 		}
 
@@ -254,7 +255,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 
 	log.Debugf("Setting up network, config: %+v", args)
 	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
-		return fmt.Errorf("creating links and routes: %v", err)
+		return fmt.Errorf("creating links and routes: %w", err)
 	}
 	return nil
 }
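Note: the switch from bare net.IP to boot.IPWithPrefix means the subnet prefix length now travels into the sandbox instead of being dropped along with the host's netmask. A standalone sketch of the prefix extraction used above (the address is made up):

    package main

    import (
    	"fmt"
    	"net"
    )

    func main() {
    	ip, ipNet, err := net.ParseCIDR("192.168.1.10/24")
    	if err != nil {
    		panic(err)
    	}
    	prefix, _ := ipNet.Mask.Size()
    	// These two values are what now travel together as
    	// boot.IPWithPrefix{Address: ip, PrefixLen: prefix}.
    	fmt.Printf("address=%v prefixLen=%d\n", ip, prefix) // address=192.168.1.10 prefixLen=24
    }

The parallel move from %v to %w when wrapping errors also lets callers inspect the underlying cause with errors.Is and errors.As rather than matching strings.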
@@ -278,8 +279,6 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
 	ll := syscall.SockaddrLinklayer{
 		Protocol: protocol,
 		Ifindex:  iface.Index,
-		Hatype:   0, // No ARP type.
-		Pkttype:  syscall.PACKET_OTHERHOST,
 	}
 	if err := syscall.Bind(fd, &ll); err != nil {
 		return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
 	}
@@ -339,9 +338,15 @@ func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, err
 		if !ok {
 			return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr)
 		}
+
+		prefix, _ := ipNet.Mask.Size()
+		link.Addresses = append(link.Addresses, boot.IPWithPrefix{
+			Address:   ipNet.IP,
+			PrefixLen: prefix,
+		})
+
 		dst := *ipNet
 		dst.IP = dst.IP.Mask(dst.Mask)
-		link.Addresses = append(link.Addresses, ipNet.IP)
 		link.Routes = append(link.Routes, boot.Route{
 			Destination: dst,
 		})
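Note: dropping Hatype and Pkttype from the bind address should be behavior-preserving. Omitted struct fields are zero-valued in Go, so the explicit Hatype: 0 was already the default, and, as background not asserted by the diff itself, packet(7) documents that bind on an AF_PACKET socket consults only sll_protocol and sll_ifindex. A Linux-only sketch of the zero-value point:

    package main

    import (
    	"fmt"
    	"syscall"
    )

    func main() {
    	a := syscall.SockaddrLinklayer{Protocol: 0x0300, Ifindex: 2}
    	b := syscall.SockaddrLinklayer{Protocol: 0x0300, Ifindex: 2, Hatype: 0}
    	fmt.Println(a == b) // true: the explicit zero added nothing
    }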
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 4a4110477..266bc0bdc 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -173,7 +173,7 @@ func New(conf *config.Config, args *Args) (*Sandbox, error) {
 }
 
 // CreateContainer creates a non-root container inside the sandbox.
-func (s *Sandbox) CreateContainer(cid string) error {
+func (s *Sandbox) CreateContainer(cid string, tty *os.File) error {
 	log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
 	sandboxConn, err := s.sandboxConnect()
 	if err != nil {
@@ -181,7 +181,16 @@ func (s *Sandbox) CreateContainer(cid string) error {
 	}
 	defer sandboxConn.Close()
 
-	if err := sandboxConn.Call(boot.ContainerCreate, &cid, nil); err != nil {
+	var files []*os.File
+	if tty != nil {
+		files = []*os.File{tty}
+	}
+
+	args := boot.CreateArgs{
+		CID:         cid,
+		FilePayload: urpc.FilePayload{Files: files},
+	}
+	if err := sandboxConn.Call(boot.ContainerCreate, &args, nil); err != nil {
 		return fmt.Errorf("creating non-root container %q: %v", cid, err)
 	}
 	return nil
@@ -211,11 +220,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error {
 }
 
 // StartContainer starts running a non-root container inside the sandbox.
-func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error {
-	for _, f := range goferFiles {
-		defer f.Close()
-	}
-
+func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error {
 	log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
 	sandboxConn, err := s.sandboxConnect()
 	if err != nil {
@@ -223,15 +228,18 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid stri
 	}
 	defer sandboxConn.Close()
 
-	// The payload must container stdin/stdout/stderr followed by gofer
-	// files.
-	files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...)
+	// The payload must contain stdin/stdout/stderr (which may be empty if using
+	// TTY) followed by gofer files.
+	payload := urpc.FilePayload{}
+	payload.Files = append(payload.Files, stdios...)
+	payload.Files = append(payload.Files, goferFiles...)
+
 	// Start running the container.
 	args := boot.StartArgs{
 		Spec:        spec,
 		Conf:        conf,
 		CID:         cid,
-		FilePayload: urpc.FilePayload{Files: files},
+		FilePayload: payload,
 	}
 	if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil {
 		return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err)
@@ -711,6 +719,8 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
 		nextFD++
 	}
 
+	_ = nextFD // All FD assignment is finished.
+
 	if args.Attached {
 		// Kill sandbox if parent process exits in attached mode.
 		cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL
@@ -983,7 +993,7 @@ func (s *Sandbox) Stacks() (string, error) {
 }
 
 // HeapProfile writes a heap profile to the given file.
-func (s *Sandbox) HeapProfile(f *os.File) error {
+func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error {
 	log.Debugf("Heap profile %q", s.ID)
 	conn, err := s.sandboxConnect()
 	if err != nil {
@@ -991,54 +1001,31 @@ func (s *Sandbox) HeapProfile(f *os.File) error {
 	}
 	defer conn.Close()
 
-	opts := control.ProfileOpts{
-		FilePayload: urpc.FilePayload{
-			Files: []*os.File{f},
-		},
-	}
-	if err := conn.Call(boot.HeapProfile, &opts, nil); err != nil {
-		return fmt.Errorf("getting sandbox %q heap profile: %v", s.ID, err)
-	}
-	return nil
-}
-
-// StartCPUProfile start CPU profile writing to the given file.
-func (s *Sandbox) StartCPUProfile(f *os.File) error {
-	log.Debugf("CPU profile start %q", s.ID)
-	conn, err := s.sandboxConnect()
-	if err != nil {
-		return err
-	}
-	defer conn.Close()
-
-	opts := control.ProfileOpts{
-		FilePayload: urpc.FilePayload{
-			Files: []*os.File{f},
-		},
+	opts := control.HeapProfileOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+		Delay:       delay,
 	}
-	if err := conn.Call(boot.StartCPUProfile, &opts, nil); err != nil {
-		return fmt.Errorf("starting sandbox %q CPU profile: %v", s.ID, err)
-	}
-	return nil
+	return conn.Call(boot.HeapProfile, &opts, nil)
 }
 
-// StopCPUProfile stops a previously started CPU profile.
-func (s *Sandbox) StopCPUProfile() error {
-	log.Debugf("CPU profile stop %q", s.ID)
+// CPUProfile collects a CPU profile.
+func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error {
+	log.Debugf("CPU profile %q", s.ID)
 	conn, err := s.sandboxConnect()
 	if err != nil {
 		return err
 	}
 	defer conn.Close()
 
-	if err := conn.Call(boot.StopCPUProfile, nil, nil); err != nil {
-		return fmt.Errorf("stopping sandbox %q CPU profile: %v", s.ID, err)
+	opts := control.CPUProfileOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+		Duration:    duration,
 	}
-	return nil
+	return conn.Call(boot.CPUProfile, &opts, nil)
 }
 
 // BlockProfile writes a block profile to the given file.
-func (s *Sandbox) BlockProfile(f *os.File) error {
+func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error {
 	log.Debugf("Block profile %q", s.ID)
 	conn, err := s.sandboxConnect()
 	if err != nil {
@@ -1046,19 +1033,15 @@ func (s *Sandbox) BlockProfile(f *os.File) error {
 	}
 	defer conn.Close()
 
-	opts := control.ProfileOpts{
-		FilePayload: urpc.FilePayload{
-			Files: []*os.File{f},
-		},
-	}
-	if err := conn.Call(boot.BlockProfile, &opts, nil); err != nil {
-		return fmt.Errorf("getting sandbox %q block profile: %v", s.ID, err)
+	opts := control.BlockProfileOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+		Duration:    duration,
 	}
-	return nil
+	return conn.Call(boot.BlockProfile, &opts, nil)
 }
 
 // MutexProfile writes a mutex profile to the given file.
-func (s *Sandbox) MutexProfile(f *os.File) error {
+func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error {
 	log.Debugf("Mutex profile %q", s.ID)
 	conn, err := s.sandboxConnect()
 	if err != nil {
@@ -1066,50 +1049,27 @@ func (s *Sandbox) MutexProfile(f *os.File) error {
 	}
 	defer conn.Close()
 
-	opts := control.ProfileOpts{
-		FilePayload: urpc.FilePayload{
-			Files: []*os.File{f},
-		},
-	}
-	if err := conn.Call(boot.MutexProfile, &opts, nil); err != nil {
-		return fmt.Errorf("getting sandbox %q mutex profile: %v", s.ID, err)
-	}
-	return nil
-}
-
-// StartTrace start trace writing to the given file.
-func (s *Sandbox) StartTrace(f *os.File) error {
-	log.Debugf("Trace start %q", s.ID)
-	conn, err := s.sandboxConnect()
-	if err != nil {
-		return err
-	}
-	defer conn.Close()
-
-	opts := control.ProfileOpts{
-		FilePayload: urpc.FilePayload{
-			Files: []*os.File{f},
-		},
+	opts := control.MutexProfileOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+		Duration:    duration,
 	}
-	if err := conn.Call(boot.StartTrace, &opts, nil); err != nil {
-		return fmt.Errorf("starting sandbox %q trace: %v", s.ID, err)
-	}
-	return nil
+	return conn.Call(boot.MutexProfile, &opts, nil)
 }
 
-// StopTrace stops a previously started trace.
-func (s *Sandbox) StopTrace() error {
-	log.Debugf("Trace stop %q", s.ID)
+// Trace collects an execution trace.
+func (s *Sandbox) Trace(f *os.File, duration time.Duration) error {
+	log.Debugf("Trace %q", s.ID)
 	conn, err := s.sandboxConnect()
 	if err != nil {
 		return err
 	}
 	defer conn.Close()
 
-	if err := conn.Call(boot.StopTrace, nil, nil); err != nil {
-		return fmt.Errorf("stopping sandbox %q trace: %v", s.ID, err)
+	opts := control.TraceProfileOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+		Duration:    duration,
 	}
-	return nil
+	return conn.Call(boot.Trace, &opts, nil)
 }
 
 // ChangeLogging changes logging options.
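Note: the profiling RPCs collapse from start/stop pairs into single duration-bounded calls, so callers no longer track in-flight profiling state. A hypothetical caller of the new surface (only the Sandbox methods themselves come from this diff; the wiring around them is assumed):

    import (
    	"os"
    	"time"

    	"gvisor.dev/gvisor/runsc/sandbox"
    )

    // collectCPUProfile gathers d worth of CPU samples into out.
    func collectCPUProfile(s *sandbox.Sandbox, out string, d time.Duration) error {
    	f, err := os.Create(out)
    	if err != nil {
    		return err
    	}
    	defer f.Close()
    	// One call bounds the whole collection window; there is no
    	// separate Stop RPC to remember anymore.
    	return s.CPUProfile(f, d)
    }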
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index fdbba1832..ea55bbc7d 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -493,6 +493,31 @@ func EnvVar(env []string, name string) (string, bool) {
 	return "", false
 }
 
+// ResolveEnvs transforms lists of environment variables into a single list of
+// environment variables. If a variable is defined multiple times, the last
+// value is used.
+func ResolveEnvs(envs ...[]string) ([]string, error) {
+	// First create a map of variable names to values. This removes any
+	// duplicates.
+	envMap := make(map[string]string)
+	for _, env := range envs {
+		for _, str := range env {
+			parts := strings.SplitN(str, "=", 2)
+			if len(parts) != 2 {
+				return nil, fmt.Errorf("invalid variable: %s", str)
+			}
+			envMap[parts[0]] = parts[1]
+		}
+	}
+	// Reassemble envMap into a list of environment variables of the form
+	// NAME=VALUE.
+	env := make([]string, 0, len(envMap))
+	for k, v := range envMap {
+		env = append(env, fmt.Sprintf("%s=%s", k, v))
+	}
+	return env, nil
+}
+
 // FaqErrorMsg returns an error message pointing to the FAQ.
 func FaqErrorMsg(anchor, msg string) string {
 	return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor)
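Note: ResolveEnvs resolves conflicts in favor of the later list, and because de-duplication goes through a map the output order is unspecified. A small usage sketch (values are made up):

    package main

    import (
    	"fmt"

    	"gvisor.dev/gvisor/runsc/specutils"
    )

    func main() {
    	envs, err := specutils.ResolveEnvs(
    		[]string{"PATH=/usr/bin", "TERM=xterm"},
    		[]string{"PATH=/usr/local/bin"}, // wins over the earlier PATH
    	)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println(envs) // e.g. [TERM=xterm PATH=/usr/local/bin], order not guaranteed
    }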