diff options
Diffstat (limited to 'runsc/boot/loader.go')
-rw-r--r-- | runsc/boot/loader.go | 201 |
1 files changed, 120 insertions, 81 deletions
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9cd9c5909..2e652ddad 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -16,22 +16,25 @@ package boot import ( + "errors" "fmt" mrand "math/rand" "os" "runtime" "sync/atomic" - "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fdimport" @@ -66,7 +69,9 @@ import ( "gvisor.dev/gvisor/runsc/boot/filter" _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" + "gvisor.dev/gvisor/runsc/specutils/seccomp" // Include supported socket providers. "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" @@ -78,7 +83,7 @@ import ( ) type containerInfo struct { - conf *Config + conf *config.Config // spec is the base configuration for the root container. spec *specs.Spec @@ -87,10 +92,10 @@ type containerInfo struct { procArgs kernel.CreateProcessArgs // stdioFDs contains stdin, stdout, and stderr. - stdioFDs []int + stdioFDs []*fd.FD // goferFDs are the FDs that attach the sandbox to the gofers. - goferFDs []int + goferFDs []*fd.FD } // Loader keeps state needed to start the kernel and run the container.. @@ -164,7 +169,7 @@ type Args struct { // Spec is the sandbox specification. Spec *specs.Spec // Conf is the system configuration. - Conf *Config + Conf *config.Config // ControllerFD is the FD to the URPC controller. The Loader takes ownership // of this FD and may close it at any time. ControllerFD int @@ -187,7 +192,7 @@ type Args struct { } // make sure stdioFDs are always the same on initial start and on restore -const startingStdioFD = 64 +const startingStdioFD = 256 // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. @@ -346,7 +351,7 @@ func New(args Args) (*Loader, error) { if err != nil { return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) } - defer hostFilesystem.DecRef() + defer hostFilesystem.DecRef(k.SupervisorContext()) hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { return nil, fmt.Errorf("failed to create hostfs mount: %v", err) @@ -354,23 +359,36 @@ func New(args Args) (*Loader, error) { k.SetHostMount(hostMount) } + info := containerInfo{ + conf: args.Conf, + spec: args.Spec, + procArgs: procArgs, + } + // Make host FDs stable between invocations. Host FDs must map to the exact // same number when the sandbox is restored. Otherwise the wrong FD will be // used. - var stdioFDs []int newfd := startingStdioFD - for _, fd := range args.StdioFDs { - err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC) - if err != nil { - return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + for _, stdioFD := range args.StdioFDs { + // Check that newfd is unused to avoid clobbering over it. + if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { + if err != nil { + return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) + } + return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) } - stdioFDs = append(stdioFDs, newfd) - err = syscall.Close(fd) + + err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) if err != nil { - return nil, fmt.Errorf("close original stdioFDs failed: %v", err) + return nil, fmt.Errorf("dup3 of stdios failed: %w", err) } + info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) + _ = unix.Close(stdioFD) newfd++ } + for _, goferFD := range args.GoferFDs { + info.goferFDs = append(info.goferFDs, fd.New(goferFD)) + } eid := execID{cid: args.ID} l := &Loader{ @@ -379,13 +397,7 @@ func New(args Args) (*Loader, error) { sandboxID: args.ID, processes: map[execID]*execProcess{eid: {}}, mountHints: mountHints, - root: containerInfo{ - conf: args.Conf, - stdioFDs: stdioFDs, - goferFDs: args.GoferFDs, - spec: args.Spec, - procArgs: procArgs, - }, + root: info, } // We don't care about child signals; some platforms can generate a @@ -458,9 +470,19 @@ func (l *Loader) Destroy() { l.stopSignalForwarding() } l.watchdog.Stop() + + // In the success case, stdioFDs and goferFDs will only contain + // released/closed FDs that ownership has been passed over to host FDs and + // gofer sessions. Close them here in case on failure. + for _, fd := range l.root.stdioFDs { + _ = fd.Close() + } + for _, fd := range l.root.goferFDs { + _ = fd.Close() + } } -func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { +func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) { p, err := platform.Lookup(conf.Platform) if err != nil { panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) @@ -487,13 +509,14 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return mf, nil } +// installSeccompFilters installs sandbox seccomp filters with the host. func (l *Loader) installSeccompFilters() error { if l.root.conf.DisableSeccomp { filter.Report("syscall filter is DISABLED. Running in less secure mode.") } else { opts := filter.Options{ Platform: l.k.Platform, - HostNetwork: l.root.conf.Network == NetworkHost, + HostNetwork: l.root.conf.Network == config.NetworkHost, ProfileEnable: l.root.conf.ProfileEnable, ControllerFD: l.ctrl.srv.FD(), } @@ -520,7 +543,7 @@ func (l *Loader) Run() error { } func (l *Loader) run() error { - if l.root.conf.Network == NetworkHost { + if l.root.conf.Network == config.NetworkHost { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. log.Debugf("Configuring host network") @@ -557,6 +580,7 @@ func (l *Loader) run() error { if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { return err } + } ep.tg = l.k.GlobalInit() @@ -586,19 +610,6 @@ func (l *Loader) run() error { } }) - // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again - // either in createFDTable() during initial start or in descriptor.initAfterLoad() - // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the - // passed FDs, so only close for VFS1. - if !kernel.VFS2Enabled { - for _, fd := range l.root.stdioFDs { - err := syscall.Close(fd) - if err != nil { - return fmt.Errorf("close dup()ed stdioFDs: %v", err) - } - } - } - log.Infof("Process should have started...") l.watchdog.Start() return l.k.Start() @@ -618,9 +629,9 @@ func (l *Loader) createContainer(cid string) error { } // startContainer starts a child container. It returns the thread group ID of -// the newly created process. Caller owns 'files' and may close them after -// this method returns. -func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error { +// the newly created process. Used FDs are either closed or released. It's safe +// for the caller to close any remaining files upon return. +func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -671,28 +682,15 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file } info := &containerInfo{ - conf: conf, - spec: spec, + conf: conf, + spec: spec, + stdioFDs: files[:3], + goferFDs: files[3:], } info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } - - // setupContainerFS() dups stdioFDs, so we don't need to dup them here. - for _, f := range files[:3] { - info.stdioFDs = append(info.stdioFDs, int(f.Fd())) - } - - // Can't take ownership away from os.File. dup them to get a new FDs. - for _, f := range files[3:] { - fd, err := syscall.Dup(int(f.Fd())) - if err != nil { - return fmt.Errorf("failed to dup file: %v", err) - } - info.goferFDs = append(info.goferFDs, fd) - } - tg, err := l.createContainerProcess(false, cid, info, ep) if err != nil { return err @@ -755,7 +753,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn return nil, fmt.Errorf("creating process: %v", err) } // CreateProcess takes a reference on FDTable if successful. - info.procArgs.FDTable.DecRef() + info.procArgs.FDTable.DecRef(ctx) // Set the foreground process group on the TTY to the global init process // group, since that is what we are about to start running. @@ -770,19 +768,44 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn } } + // Install seccomp filters with the new task if there are any. + if info.conf.OCISeccomp { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) + if err != nil { + return nil, fmt.Errorf("building seccomp program: %v", err) + } + + if log.IsLogging(log.Debug) { + out, _ := bpf.DecodeProgram(program) + log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out) + } + + task := tg.Leader() + // NOTE: It seems Flags are ignored by runc so we ignore them too. + if err := task.AppendSyscallFilter(program, true); err != nil { + return nil, fmt.Errorf("appending seccomp filters: %v", err) + } + } + } else { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + log.Warningf("Seccomp spec is being ignored") + } + } + return tg, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on // the gofer FDs looking for disconnects, and destroys the container if a // disconnect occurs in any of the gofer FDs. -func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { +func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) { go func() { log.Debugf("Monitoring gofer health for container %q", cid) var events []unix.PollFd - for _, fd := range goferFDs { + for _, goferFD := range goferFDs { events = append(events, unix.PollFd{ - Fd: int32(fd), + Fd: int32(goferFD.FD()), Events: unix.POLLHUP | unix.POLLRDHUP, }) } @@ -890,22 +913,20 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Add the HOME environment variable if it is not already set. if kernel.VFS2Enabled { - defer args.MountNamespaceVFS2.DecRef() - root := args.MountNamespaceVFS2.Root() - defer root.DecRef() ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespaceVFS2.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) if err != nil { return 0, err } args.Envv = envv } else { - defer args.MountNamespace.DecRef() - root := args.MountNamespace.Root() - defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespace.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) if err != nil { return 0, err @@ -1002,20 +1023,25 @@ func (l *Loader) WaitExit() kernel.ExitStatus { // Wait for container. l.k.WaitExited() + // Cleanup + l.ctrl.stop() + + refs.OnExit() + return l.k.GlobalInit().ExitStatus() } -func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { +func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { // Create an empty network stack because the network namespace may be empty at // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). switch conf.Network { - case NetworkHost: + case config.NetworkHost: // No network namespacing support for hostinet yet, hence creator is nil. return inet.NewRootNamespace(hostinet.NewStack(), nil), nil - case NetworkNone, NetworkSandbox: + case config.NetworkNone, config.NetworkSandbox: s, err := newEmptySandboxNetworkStack(clock, uniqueID) if err != nil { return nil, err @@ -1033,8 +1059,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni } func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { - netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} - transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} + transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4} s := netstack.Stack{stack.New(stack.Options{ NetworkProtocols: netProtos, TransportProtocols: transProtos, @@ -1048,17 +1074,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in })} // Enable SACK Recovery. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { - return nil, fmt.Errorf("failed to enable SACK: %s", err) + { + opt := tcpip.TCPSACKEnabled(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } // Set default TTLs as required by socket/netstack. - s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) - s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + { + opt := tcpip.DefaultTTLOption(netstack.DefaultTTL) + if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err) + } + if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err) + } + } // Enable Receive Buffer Auto-Tuning. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPModerateReceiveBufferOption(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } return &s, nil @@ -1254,7 +1293,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2 return ep.tty, ep.ttyVFS2, nil } -func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if len(stdioFDs) != 3 { return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } @@ -1263,7 +1302,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F fdTable := k.NewFDTable() ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) if err != nil { - fdTable.DecRef() + fdTable.DecRef(ctx) return nil, nil, nil, err } return fdTable, ttyFile, ttyFileVFS2, nil |