Diffstat (limited to 'runsc/boot/loader.go')
-rw-r--r--  runsc/boot/loader.go  134
1 file changed, 83 insertions(+), 51 deletions(-)
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index ad4d50008..b46d84e5a 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -58,6 +58,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/watchdog"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -119,6 +120,10 @@ type Loader struct {
// container. It should be called when a sandbox is destroyed.
stopSignalForwarding func()
+ // stopProfiling stops profiling started at container creation. It
+ // should be called when a sandbox is destroyed.
+ stopProfiling func()
+
// restore is set to true if we are restoring a container.
restore bool
@@ -198,6 +203,21 @@ type Args struct {
TotalMem uint64
// UserLogFD is the file descriptor to write user logs to.
UserLogFD int
+ // ProfileBlockFD is the file descriptor to write a block profile to.
+ // Valid if >=0.
+ ProfileBlockFD int
+ // ProfileCPUFD is the file descriptor to write a CPU profile to.
+ // Valid if >=0.
+ ProfileCPUFD int
+ // ProfileHeapFD is the file descriptor to write a heap profile to.
+ // Valid if >=0.
+ ProfileHeapFD int
+ // ProfileMutexFD is the file descriptor to write a mutex profile to.
+ // Valid if >=0.
+ ProfileMutexFD int
+ // TraceFD is the file descriptor to write a Go execution trace to.
+ // Valid if >=0.
+ TraceFD int
}
// make sure stdioFDs are always the same on initial start and on restore
@@ -206,6 +226,8 @@ const startingStdioFD = 256
// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
func New(args Args) (*Loader, error) {
+ stopProfiling := startProfiling(args)
+
// We initialize the rand package now to make sure /dev/urandom is pre-opened
// on kernels that do not support getrandom(2).
if err := rand.Init(); err != nil {
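
Note: startProfiling and the stopProfiling closure it returns are defined elsewhere in this commit. As a minimal, hypothetical sketch (not the commit's actual implementation), a startProfiling built on the Args fields above would adopt each valid FD as an *os.File and start the matching standard-library profiler, with the returned closure flushing and stopping everything. Imports assumed: os, runtime, runtime/pprof, runtime/trace.

    // Hypothetical sketch of startProfiling; each FD that is >=0 is
    // wrapped in an *os.File and the corresponding profiler is started.
    func startProfiling(args Args) func() {
    	var stoppers []func()
    	if args.ProfileCPUFD >= 0 {
    		f := os.NewFile(uintptr(args.ProfileCPUFD), "cpu")
    		_ = pprof.StartCPUProfile(f) // samples until StopCPUProfile
    		stoppers = append(stoppers, func() { pprof.StopCPUProfile(); f.Close() })
    	}
    	if args.TraceFD >= 0 {
    		f := os.NewFile(uintptr(args.TraceFD), "trace")
    		_ = trace.Start(f) // Go execution trace
    		stoppers = append(stoppers, func() { trace.Stop(); f.Close() })
    	}
    	if args.ProfileHeapFD >= 0 {
    		f := os.NewFile(uintptr(args.ProfileHeapFD), "heap")
    		// The heap profile is a snapshot; write it once at stop time.
    		stoppers = append(stoppers, func() { _ = pprof.Lookup("heap").WriteTo(f, 0); f.Close() })
    	}
    	if args.ProfileBlockFD >= 0 {
    		f := os.NewFile(uintptr(args.ProfileBlockFD), "block")
    		runtime.SetBlockProfileRate(1) // begin collecting block events
    		stoppers = append(stoppers, func() { _ = pprof.Lookup("block").WriteTo(f, 0); f.Close() })
    	}
    	if args.ProfileMutexFD >= 0 {
    		f := os.NewFile(uintptr(args.ProfileMutexFD), "mutex")
    		runtime.SetMutexProfileFraction(1) // begin collecting mutex events
    		stoppers = append(stoppers, func() { _ = pprof.Lookup("mutex").WriteTo(f, 0); f.Close() })
    	}
    	return func() {
    		for _, stop := range stoppers {
    			stop()
    		}
    	}
    }

The closure is stored as l.stopProfiling and invoked from Destroy (see below), so the profiles cover the sandbox's whole lifetime.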
@@ -278,19 +300,15 @@ func New(args Args) (*Loader, error) {
}
// Create timekeeper.
- tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
- if err != nil {
- return nil, fmt.Errorf("creating timekeeper: %w", err)
- }
+ tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
tk.SetClocks(time.NewCalibratedClocks())
- k.SetTimekeeper(tk)
if err := enableStrace(args.Conf); err != nil {
return nil, fmt.Errorf("enabling strace: %w", err)
}
// Create root network namespace/stack.
- netns, err := newRootNetworkNamespace(args.Conf, k, k)
+ netns, err := newRootNetworkNamespace(args.Conf, tk, k)
if err != nil {
return nil, fmt.Errorf("creating network: %w", err)
}
@@ -332,6 +350,7 @@ func New(args Args) (*Loader, error) {
// to createVFS in order to mount (among other things) procfs.
if err = k.Init(kernel.InitKernelArgs{
FeatureSet: cpuid.HostFeatureSet(),
+ Timekeeper: tk,
RootUserNamespace: creds.UserNamespace,
RootNetworkNamespace: netns,
ApplicationCores: uint(args.NumCPU),
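
Taken together, the timekeeper hunks change the initialization flow: kernel.NewTimekeeper no longer returns an error, the timekeeper now reaches the kernel through InitKernelArgs instead of a separate k.SetTimekeeper call, and the same object doubles as the tcpip.Clock for the root network namespace. A condensed view (not a literal excerpt; unrelated fields elided):

    // Condensed view of the new flow in New().
    tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
    tk.SetClocks(time.NewCalibratedClocks())
    netns, err := newRootNetworkNamespace(args.Conf, tk, k) // tk serves as the tcpip.Clock
    // ...
    err = k.Init(kernel.InitKernelArgs{
    	Timekeeper: tk, // replaces the old k.SetTimekeeper(tk)
    	// ...
    })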
@@ -402,12 +421,13 @@ func New(args Args) (*Loader, error) {
eid := execID{cid: args.ID}
l := &Loader{
- k: k,
- watchdog: dog,
- sandboxID: args.ID,
- processes: map[execID]*execProcess{eid: {}},
- mountHints: mountHints,
- root: info,
+ k: k,
+ watchdog: dog,
+ sandboxID: args.ID,
+ processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
+ root: info,
+ stopProfiling: stopProfiling,
}
// We don't care about child signals; some platforms can generate a
@@ -500,6 +520,8 @@ func (l *Loader) Destroy() {
for _, f := range l.root.goferFDs {
_ = f.Close()
}
+
+ l.stopProfiling()
}
func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
@@ -636,8 +658,8 @@ func (l *Loader) run() error {
return l.k.Start()
}
-// createContainer creates a new container inside the sandbox.
-func (l *Loader) createContainer(cid string, tty *fd.FD) error {
+// createSubcontainer creates a new container inside the sandbox.
+func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error {
l.mu.Lock()
defer l.mu.Unlock()
@@ -649,10 +671,10 @@ func (l *Loader) createContainer(cid string, tty *fd.FD) error {
return nil
}
-// startContainer starts a child container. It returns the thread group ID of
+// startSubcontainer starts a child container. It returns the thread group ID of
// the newly created process. Used FDs are either closed or released. It's safe
// for the caller to close any remaining files upon return.
-func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error {
+func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
@@ -718,7 +740,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin
return fmt.Errorf("using TTY, stdios not expected: %d", l)
}
if ep.hostTTY == nil {
- return fmt.Errorf("terminal enabled but no TTY provided (--console-socket possibly passed)")
+ return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
}
info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
ep.hostTTY = nil
@@ -737,7 +759,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin
func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
// Create the FD map, which will set stdin, stdout, and stderr.
ctx := info.procArgs.NewContext(l.k)
- fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs)
+ fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.spec.Process.User)
if err != nil {
return nil, nil, nil, fmt.Errorf("importing fds: %w", err)
}
@@ -745,8 +767,11 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
// ours either way.
info.procArgs.FDTable = fdTable
- // Setup the child container file system.
- l.startGoferMonitor(cid, info.goferFDs)
+ // Gofer FDs must be ordered and the first FD is always the rootfs.
+ if len(info.goferFDs) < 1 {
+ return nil, nil, nil, fmt.Errorf("rootfs gofer FD not found")
+ }
+ l.startGoferMonitor(cid, int32(info.goferFDs[0].FD()))
mntr := newContainerMounter(info, l.k, l.mountHints, kernel.VFS2Enabled)
if root {
@@ -819,17 +844,21 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and kills the container processes if a
-// disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
+// the gofer FD looking for disconnects, and kills the container processes if
+// the rootfs FD disconnects.
+//
+// Note that other gofer mounts are allowed to be unmounted and disconnected.
+func (l *Loader) startGoferMonitor(cid string, rootfsGoferFD int32) {
+ if rootfsGoferFD < 0 {
+ panic(fmt.Sprintf("invalid FD: %d", rootfsGoferFD))
+ }
go func() {
log.Debugf("Monitoring gofer health for container %q", cid)
- var events []unix.PollFd
- for _, goferFD := range goferFDs {
- events = append(events, unix.PollFd{
- Fd: int32(goferFD.FD()),
+ events := []unix.PollFd{
+ {
+ Fd: rootfsGoferFD,
Events: unix.POLLHUP | unix.POLLRDHUP,
- })
+ },
}
_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
// Use ppoll instead of poll because it's already whitelisted in seccomp.
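
The hangup detection rests on ppoll reporting POLLHUP/POLLRDHUP once the gofer closes its end of the connection. A self-contained demonstration of the same mechanism outside runsc (hypothetical demo code; the real monitor additionally retries EINTR via specutils.RetryEintr):

    // Demo: observe a peer hangup with ppoll, as the gofer monitor does.
    package main

    import (
    	"fmt"

    	"golang.org/x/sys/unix"
    )

    func main() {
    	// A connected pair standing in for the sentry<->gofer connection.
    	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM, 0)
    	if err != nil {
    		panic(err)
    	}
    	unix.Close(fds[1]) // the "gofer" side goes away

    	events := []unix.PollFd{{
    		Fd:     int32(fds[0]),
    		Events: unix.POLLHUP | unix.POLLRDHUP,
    	}}
    	// nil timeout: block until an event fires.
    	if _, err := unix.Ppoll(events, nil, nil); err != nil {
    		panic(err)
    	}
    	fmt.Printf("revents: %#x\n", events[0].Revents) // hangup bits set
    }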
@@ -854,9 +883,9 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
}()
}
-// destroyContainer stops a container if it is still running and cleans up its
-// filesystem.
-func (l *Loader) destroyContainer(cid string) error {
+// destroySubcontainer stops a container if it is still running and cleans up
+// its filesystem.
+func (l *Loader) destroySubcontainer(cid string) error {
l.mu.Lock()
defer l.mu.Unlock()
@@ -983,7 +1012,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
tty: ttyFile,
ttyVFS2: ttyFileVFS2,
}
- log.Debugf("updated processes: %s", l.processes)
+ log.Debugf("updated processes: %v", l.processes)
return tgid, nil
}
@@ -1004,7 +1033,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
// Check for leaks and write coverage report after the root container has
// exited. This guarantees that the report is written in cases where the
- // sandbox is killed by a signal after the ContainerWait request is completed.
+ // sandbox is killed by a signal after the ContMgrWait request is completed.
if l.root.procArgs.ContainerID == cid {
// All sentry-created resources should have been released at this point.
refsvfs2.DoLeakCheck()
@@ -1027,7 +1056,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e
l.mu.Lock()
delete(l.processes, eid)
- log.Debugf("updated processes (removal): %s", l.processes)
+ log.Debugf("updated processes (removal): %v", l.processes)
l.mu.Unlock()
return nil
}
@@ -1054,7 +1083,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e
// to exit.
func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
tg.WaitExited()
- return tg.ExitStatus().Status()
+ return uint32(tg.ExitStatus())
}
// WaitForStartSignal waits for a start signal from the control server.
@@ -1063,7 +1092,7 @@ func (l *Loader) WaitForStartSignal() {
}
// WaitExit waits for the root container to exit, and returns its exit status.
-func (l *Loader) WaitExit() kernel.ExitStatus {
+func (l *Loader) WaitExit() linux.WaitStatus {
// Wait for container.
l.k.WaitExited()
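
linux.WaitStatus is the raw numeric status word in the layout wait(2) uses, so callers decode it with the usual bit tests. A sketch of that decoding with plain bit operations (standard Linux encoding; gvisor's own helper methods on WaitStatus are not shown here; imports fmt):

    // Standard wait(2) status layout, decoded by hand.
    func describe(status uint32) string {
    	switch {
    	case status&0x7f == 0: // WIFEXITED
    		return fmt.Sprintf("exited with code %d", (status>>8)&0xff) // WEXITSTATUS
    	case status&0xff == 0x7f: // WIFSTOPPED
    		return fmt.Sprintf("stopped by signal %d", (status>>8)&0xff) // WSTOPSIG
    	default: // WIFSIGNALED
    		return fmt.Sprintf("terminated by signal %d", status&0x7f) // WTERMSIG
    	}
    }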
@@ -1084,23 +1113,24 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st
return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
case config.NetworkNone, config.NetworkSandbox:
- s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+ s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite)
if err != nil {
return nil, err
}
creator := &sandboxNetstackCreator{
- clock: clock,
- uniqueID: uniqueID,
+ clock: clock,
+ uniqueID: uniqueID,
+ allowPacketEndpointWrite: conf.AllowPacketEndpointWrite,
}
return inet.NewRootNamespace(s, creator), nil
default:
- panic(fmt.Sprintf("invalid network configuration: %d", conf.Network))
+ panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
}
}
-func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) {
netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
transProtos := []stack.TransportProtocolFactory{
tcp.NewProtocol,
@@ -1116,9 +1146,10 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
HandleLocal: true,
// Enable raw sockets for users with sufficient
// privileges.
- RawFactory: raw.EndpointFactory{},
- UniqueID: uniqueID,
- DefaultIPTables: netfilter.DefaultLinuxTables,
+ RawFactory: raw.EndpointFactory{},
+ AllowPacketEndpointWrite: allowPacketEndpointWrite,
+ UniqueID: uniqueID,
+ DefaultIPTables: netfilter.DefaultLinuxTables,
})}
// Enable SACK Recovery.
@@ -1155,13 +1186,14 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
//
// +stateify savable
type sandboxNetstackCreator struct {
- clock tcpip.Clock
- uniqueID stack.UniqueID
+ clock tcpip.Clock
+ uniqueID stack.UniqueID
+ allowPacketEndpointWrite bool
}
// CreateStack implements kernel.NetworkStackCreator.CreateStack.
func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
- s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+ s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite)
if err != nil {
return nil, err
}
@@ -1170,7 +1202,7 @@ func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
n := &Network{Stack: s.(*netstack.Stack).Stack}
nicID := tcpip.NICID(f.uniqueID.UniqueID())
link := DefaultLoopbackLink
- linkEP := loopback.New()
+ linkEP := ethernet.New(loopback.New())
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return nil, err
}
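
Wrapping the loopback endpoint in ethernet.New gives the NIC ethernet framing (a proper link-layer header) that a bare loopback.New endpoint lacks. A minimal sketch of the pattern, assuming a *stack.Stack named s (error handling elided; the diff itself goes through createNICWithAddrs instead):

    // Sketch: add ethernet framing to a link endpoint before
    // registering it as a NIC.
    linkEP := ethernet.New(loopback.New())
    if err := s.CreateNIC(nicID, linkEP); err != nil {
    	return nil, fmt.Errorf("creating NIC: %v", err)
    }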
@@ -1215,7 +1247,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e
return nil
default:
- panic(fmt.Sprintf("unknown signal delivery mode %s", mode))
+ panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
}
}
@@ -1340,14 +1372,14 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
return ep.tty, ep.ttyVFS2, nil
}
-func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, user specs.User) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
if len(stdioFDs) != 3 {
return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
k := kernel.KernelFromContext(ctx)
fdTable := k.NewFDTable()
- ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
+ ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), stdioFDs)
if err != nil {
fdTable.DecRef(ctx)
return nil, nil, nil, err
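
The extra specs.User parameter lets the imported stdio files carry the container user's identity rather than the sandbox default; auth.KUID(user.UID) and auth.KGID(user.GID) are plain integer conversions from the OCI spec's numeric IDs. A hypothetical call site, mirroring the createContainerProcess hunk above:

    // Hypothetical call site for the new signature.
    fdTable, tty, ttyVFS2, err := createFDTable(
    	ctx, spec.Process.Terminal, stdioFDs, spec.Process.User)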