Diffstat (limited to 'runsc/boot/loader.go')
-rw-r--r-- | runsc/boot/loader.go | 134 |
1 file changed, 83 insertions(+), 51 deletions(-)
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index ad4d50008..b46d84e5a 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -58,6 +58,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -119,6 +120,10 @@ type Loader struct {
 	// container. It should be called when a sandbox is destroyed.
 	stopSignalForwarding func()
 
+	// stopProfiling stops profiling started at container creation. It
+	// should be called when a sandbox is destroyed.
+	stopProfiling func()
+
 	// restore is set to true if we are restoring a container.
 	restore bool
 
@@ -198,6 +203,21 @@ type Args struct {
 	TotalMem uint64
 	// UserLogFD is the file descriptor to write user logs to.
 	UserLogFD int
+	// ProfileBlockFD is the file descriptor to write a block profile to.
+	// Valid if >=0.
+	ProfileBlockFD int
+	// ProfileCPUFD is the file descriptor to write a CPU profile to.
+	// Valid if >=0.
+	ProfileCPUFD int
+	// ProfileHeapFD is the file descriptor to write a heap profile to.
+	// Valid if >=0.
+	ProfileHeapFD int
+	// ProfileMutexFD is the file descriptor to write a mutex profile to.
+	// Valid if >=0.
+	ProfileMutexFD int
+	// TraceFD is the file descriptor to write a Go execution trace to.
+	// Valid if >=0.
+	TraceFD int
 }
 
 // make sure stdioFDs are always the same on initial start and on restore
@@ -206,6 +226,8 @@ const startingStdioFD = 256
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
 func New(args Args) (*Loader, error) {
+	stopProfiling := startProfiling(args)
+
 	// We initialize the rand package now to make sure /dev/urandom is pre-opened
 	// on kernels that do not support getrandom(2).
 	if err := rand.Init(); err != nil {
@@ -278,19 +300,15 @@ func New(args Args) (*Loader, error) {
 	}
 
 	// Create timekeeper.
-	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
-	if err != nil {
-		return nil, fmt.Errorf("creating timekeeper: %w", err)
-	}
+	tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
 	tk.SetClocks(time.NewCalibratedClocks())
-	k.SetTimekeeper(tk)
 
 	if err := enableStrace(args.Conf); err != nil {
 		return nil, fmt.Errorf("enabling strace: %w", err)
 	}
 
 	// Create root network namespace/stack.
-	netns, err := newRootNetworkNamespace(args.Conf, k, k)
+	netns, err := newRootNetworkNamespace(args.Conf, tk, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %w", err)
 	}
@@ -332,6 +350,7 @@ func New(args Args) (*Loader, error) {
 	// to createVFS in order to mount (among other things) procfs.
 	if err = k.Init(kernel.InitKernelArgs{
 		FeatureSet:           cpuid.HostFeatureSet(),
+		Timekeeper:           tk,
 		RootUserNamespace:    creds.UserNamespace,
 		RootNetworkNamespace: netns,
 		ApplicationCores:     uint(args.NumCPU),
@@ -402,12 +421,13 @@ func New(args Args) (*Loader, error) {
 
 	eid := execID{cid: args.ID}
 	l := &Loader{
-		k:          k,
-		watchdog:   dog,
-		sandboxID:  args.ID,
-		processes:  map[execID]*execProcess{eid: {}},
-		mountHints: mountHints,
-		root:       info,
+		k:             k,
+		watchdog:      dog,
+		sandboxID:     args.ID,
+		processes:     map[execID]*execProcess{eid: {}},
+		mountHints:    mountHints,
+		root:          info,
+		stopProfiling: stopProfiling,
 	}
 
 	// We don't care about child signals; some platforms can generate a
@@ -500,6 +520,8 @@ func (l *Loader) Destroy() {
 	for _, f := range l.root.goferFDs {
 		_ = f.Close()
 	}
+
+	l.stopProfiling()
 }
 
 func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
@@ -636,8 +658,8 @@ func (l *Loader) run() error {
 	return l.k.Start()
 }
 
-// createContainer creates a new container inside the sandbox.
-func (l *Loader) createContainer(cid string, tty *fd.FD) error {
+// createSubcontainer creates a new container inside the sandbox.
+func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error {
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
@@ -649,10 +671,10 @@ func (l *Loader) createContainer(cid string, tty *fd.FD) error {
 	return nil
 }
 
-// startContainer starts a child container. It returns the thread group ID of
+// startSubcontainer starts a child container. It returns the thread group ID of
 // the newly created process. Used FDs are either closed or released. It's safe
 // for the caller to close any remaining files upon return.
-func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error {
+func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error {
 	// Create capabilities.
 	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
 	if err != nil {
@@ -718,7 +740,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin
 			return fmt.Errorf("using TTY, stdios not expected: %d", l)
 		}
 		if ep.hostTTY == nil {
-			return fmt.Errorf("terminal enabled but no TTY provided (--console-socket possibly passed)")
+			return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
 		}
 		info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
 		ep.hostTTY = nil
@@ -737,7 +759,7 @@
 func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	// Create the FD map, which will set stdin, stdout, and stderr.
 	ctx := info.procArgs.NewContext(l.k)
-	fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs)
+	fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.spec.Process.User)
 	if err != nil {
 		return nil, nil, nil, fmt.Errorf("importing fds: %w", err)
 	}
@@ -745,8 +767,11 @@
 	// ours either way.
 	info.procArgs.FDTable = fdTable
 
-	// Setup the child container file system.
-	l.startGoferMonitor(cid, info.goferFDs)
+	// Gofer FDs must be ordered and the first FD is always the rootfs.
+	if len(info.goferFDs) < 1 {
+		return nil, nil, nil, fmt.Errorf("rootfs gofer FD not found")
+	}
+	l.startGoferMonitor(cid, int32(info.goferFDs[0].FD()))
 
 	mntr := newContainerMounter(info, l.k, l.mountHints, kernel.VFS2Enabled)
 	if root {
@@ -819,17 +844,21 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
 }
 
 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and kills the container processes if a
-// disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
+// the gofer FD looking for disconnects, and kills the container processes if
+// the rootfs FD disconnects.
+//
+// Note that other gofer mounts are allowed to be unmounted and disconnected.
+func (l *Loader) startGoferMonitor(cid string, rootfsGoferFD int32) {
+	if rootfsGoferFD < 0 {
+		panic(fmt.Sprintf("invalid FD: %d", rootfsGoferFD))
+	}
 	go func() {
 		log.Debugf("Monitoring gofer health for container %q", cid)
-		var events []unix.PollFd
-		for _, goferFD := range goferFDs {
-			events = append(events, unix.PollFd{
-				Fd:     int32(goferFD.FD()),
+		events := []unix.PollFd{
+			{
+				Fd:     rootfsGoferFD,
 				Events: unix.POLLHUP | unix.POLLRDHUP,
-			})
+			},
 		}
 		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
 			// Use ppoll instead of poll because it's already whilelisted in seccomp.
@@ -854,9 +883,9 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 	}()
 }
 
-// destroyContainer stops a container if it is still running and cleans up its
-// filesystem.
-func (l *Loader) destroyContainer(cid string) error {
+// destroySubcontainer stops a container if it is still running and cleans up
+// its filesystem.
+func (l *Loader) destroySubcontainer(cid string) error {
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
@@ -983,7 +1012,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		tty:     ttyFile,
 		ttyVFS2: ttyFileVFS2,
 	}
-	log.Debugf("updated processes: %s", l.processes)
+	log.Debugf("updated processes: %v", l.processes)
 
 	return tgid, nil
 }
@@ -1004,7 +1033,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
 
 	// Check for leaks and write coverage report after the root container has
 	// exited. This guarantees that the report is written in cases where the
-	// sandbox is killed by a signal after the ContainerWait request is completed.
+	// sandbox is killed by a signal after the ContMgrWait request is completed.
 	if l.root.procArgs.ContainerID == cid {
 		// All sentry-created resources should have been released at this point.
 		refsvfs2.DoLeakCheck()
@@ -1027,7 +1056,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e
 
 		l.mu.Lock()
 		delete(l.processes, eid)
-		log.Debugf("updated processes (removal): %s", l.processes)
+		log.Debugf("updated processes (removal): %v", l.processes)
 		l.mu.Unlock()
 		return nil
 	}
@@ -1054,7 +1083,7 @@
 // to exit.
 func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
 	tg.WaitExited()
-	return tg.ExitStatus().Status()
+	return uint32(tg.ExitStatus())
 }
 
 // WaitForStartSignal waits for a start signal from the control server.
@@ -1063,7 +1092,7 @@ func (l *Loader) WaitForStartSignal() {
 }
 
 // WaitExit waits for the root container to exit, and returns its exit status.
-func (l *Loader) WaitExit() kernel.ExitStatus {
+func (l *Loader) WaitExit() linux.WaitStatus {
 	// Wait for container.
 	l.k.WaitExited()
 
@@ -1084,23 +1113,24 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st
 		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
 	case config.NetworkNone, config.NetworkSandbox:
-		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+		s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite)
 		if err != nil {
 			return nil, err
 		}
 		creator := &sandboxNetstackCreator{
-			clock:    clock,
-			uniqueID: uniqueID,
+			clock:                    clock,
+			uniqueID:                 uniqueID,
+			allowPacketEndpointWrite: conf.AllowPacketEndpointWrite,
 		}
 		return inet.NewRootNamespace(s, creator), nil
 
 	default:
-		panic(fmt.Sprintf("invalid network configuration: %d", conf.Network))
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
 	}
 }
 
-func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) {
 	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
 	transProtos := []stack.TransportProtocolFactory{
 		tcp.NewProtocol,
@@ -1116,9 +1146,10 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
 		HandleLocal: true,
 		// Enable raw sockets for users with sufficient
 		// privileges.
-		RawFactory:      raw.EndpointFactory{},
-		UniqueID:        uniqueID,
-		DefaultIPTables: netfilter.DefaultLinuxTables,
+		RawFactory:               raw.EndpointFactory{},
+		AllowPacketEndpointWrite: allowPacketEndpointWrite,
+		UniqueID:                 uniqueID,
+		DefaultIPTables:          netfilter.DefaultLinuxTables,
 	})}
 
 	// Enable SACK Recovery.
@@ -1155,13 +1186,14 @@
 //
 // +stateify savable
 type sandboxNetstackCreator struct {
-	clock    tcpip.Clock
-	uniqueID stack.UniqueID
+	clock                    tcpip.Clock
+	uniqueID                 stack.UniqueID
+	allowPacketEndpointWrite bool
 }
 
 // CreateStack implements kernel.NetworkStackCreator.CreateStack.
 func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
-	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite)
 	if err != nil {
 		return nil, err
 	}
@@ -1170,7 +1202,7 @@ func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
 	n := &Network{Stack: s.(*netstack.Stack).Stack}
 	nicID := tcpip.NICID(f.uniqueID.UniqueID())
 	link := DefaultLoopbackLink
-	linkEP := loopback.New()
+	linkEP := ethernet.New(loopback.New())
 	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
 		return nil, err
 	}
@@ -1215,7 +1247,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e
 		return nil
 
 	default:
-		panic(fmt.Sprintf("unknown signal delivery mode %s", mode))
+		panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
 	}
 }
 
@@ -1340,14 +1372,14 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
 	return ep.tty, ep.ttyVFS2, nil
 }
 
-func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, user specs.User) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
	if len(stdioFDs) != 3 {
 		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
 
 	k := kernel.KernelFromContext(ctx)
 	fdTable := k.NewFDTable()
-	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
+	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), stdioFDs)
 	if err != nil {
 		fdTable.DecRef(ctx)
 		return nil, nil, nil, err
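The hunks above add per-profiler file descriptors to Args and a stopProfiling callback to the Loader, but the body of startProfiling is not part of this file. As a rough sketch of how such a helper could drive Go's profilers over donated FDs (the function body, the chosen rates, and the file names below are assumptions, not taken from this commit), it might look like:

// Hypothetical sketch only: start the profilers requested via Args (each FD
// is "valid if >=0") and return a closure that flushes and stops them. This
// is not the implementation referenced by the diff.
package boot

import (
	"os"
	"runtime"
	"runtime/pprof"
	"runtime/trace"
)

func startProfiling(args Args) func() {
	var stops []func()

	// file wraps a donated FD in an *os.File, or returns nil if the FD is
	// unset (negative), matching the "Valid if >=0" convention in Args.
	file := func(fd int, name string) *os.File {
		if fd < 0 {
			return nil
		}
		return os.NewFile(uintptr(fd), name)
	}

	if f := file(args.ProfileCPUFD, "cpu-profile"); f != nil {
		_ = pprof.StartCPUProfile(f) // samples until the stop closure runs
		stops = append(stops, func() { pprof.StopCPUProfile(); f.Close() })
	}
	if f := file(args.TraceFD, "go-trace"); f != nil {
		_ = trace.Start(f)
		stops = append(stops, func() { trace.Stop(); f.Close() })
	}
	if f := file(args.ProfileBlockFD, "block-profile"); f != nil {
		runtime.SetBlockProfileRate(1) // assumed rate: record every blocking event
		stops = append(stops, func() {
			_ = pprof.Lookup("block").WriteTo(f, 0)
			f.Close()
		})
	}
	if f := file(args.ProfileMutexFD, "mutex-profile"); f != nil {
		runtime.SetMutexProfileFraction(1)
		stops = append(stops, func() {
			_ = pprof.Lookup("mutex").WriteTo(f, 0)
			f.Close()
		})
	}
	if f := file(args.ProfileHeapFD, "heap-profile"); f != nil {
		stops = append(stops, func() {
			_ = pprof.Lookup("heap").WriteTo(f, 0)
			f.Close()
		})
	}

	// The returned closure is what the diff stores as l.stopProfiling and
	// invokes from Loader.Destroy.
	return func() {
		for _, stop := range stops {
			stop()
		}
	}
}

Passing already open FDs rather than paths keeps the boot process from needing filesystem access to write profiles, in the same spirit as the other donated FDs in Args such as UserLogFD, and the profiles are flushed when Destroy runs l.stopProfiling().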