diff options
Diffstat (limited to 'runsc/boot/loader.go')
-rw-r--r-- | runsc/boot/loader.go | 208 |
1 files changed, 128 insertions, 80 deletions
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 42bddb2e8..8e8c6105b 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,51 +20,52 @@ import ( mrand "math/rand" "os" "runtime" + "strings" "sync" "sync/atomic" "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/cpuid" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/memutil" - "gvisor.googlesource.com/gvisor/pkg/rand" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - "gvisor.googlesource.com/gvisor/pkg/sentry/control" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" - "gvisor.googlesource.com/gvisor/pkg/sentry/inet" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/sentry/loader" - "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" - "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling" - slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/usage" - "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp" - "gvisor.googlesource.com/gvisor/runsc/boot/filter" - "gvisor.googlesource.com/gvisor/runsc/specutils" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/sighandling" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/runsc/boot/filter" + _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. + "gvisor.dev/gvisor/runsc/specutils" // Include supported socket providers. - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/epsocket" + "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" + _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) // Loader keeps state needed to start the kernel and run the container.. @@ -117,6 +118,10 @@ type Loader struct { // // processes is guardded by mu. processes map[execID]*execProcess + + // mountHints provides extra information about mounts for containers that + // apply to the entire pod. + mountHints *podMountHints } // execID uniquely identifies a sentry process that is executed in a container. @@ -201,7 +206,9 @@ func New(args Args) (*Loader, error) { // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. - vdso, err := loader.PrepareVDSO(k) + // + // FIXME(b/109889800): Use non-nil context. + vdso, err := loader.PrepareVDSO(nil, k) if err != nil { return nil, fmt.Errorf("creating vdso: %v", err) } @@ -255,7 +262,7 @@ func New(args Args) (*Loader, error) { // Adjust the total memory returned by the Sentry so that applications that // use /proc/meminfo can make allocations based on this limit. usage.MinimumTotalMemoryBytes = args.TotalMem - log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(2^30)) + log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30)) } // Initiate the Kernel object, which is required by the Context passed @@ -299,6 +306,11 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing compat logs: %v", err) } + mountHints, err := newPodMountHints(args.Spec) + if err != nil { + return nil, fmt.Errorf("creating pod mount hints: %v", err) + } + eid := execID{cid: args.ID} l := &Loader{ k: k, @@ -311,6 +323,7 @@ func New(args Args) (*Loader, error) { rootProcArgs: procArgs, sandboxID: args.ID, processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, } // We don't care about child signals; some platforms can generate a @@ -402,19 +415,12 @@ func (l *Loader) Destroy() { } func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { - switch conf.Platform { - case PlatformPtrace: - log.Infof("Platform: ptrace") - return ptrace.New() - case PlatformKVM: - log.Infof("Platform: kvm") - if deviceFile == nil { - return nil, fmt.Errorf("kvm device file must be provided") - } - return kvm.New(deviceFile) - default: - return nil, fmt.Errorf("invalid platform %v", conf.Platform) + p, err := platform.Lookup(conf.Platform) + if err != nil { + panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) } + log.Infof("Platform: %s", conf.Platform) + return p.New(deviceFile) } func createMemoryFile() (*pgalloc.MemoryFile, error) { @@ -435,6 +441,23 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return mf, nil } +func (l *Loader) installSeccompFilters() error { + if l.conf.DisableSeccomp { + filter.Report("syscall filter is DISABLED. Running in less secure mode.") + } else { + opts := filter.Options{ + Platform: l.k.Platform, + HostNetwork: l.conf.Network == NetworkHost, + ProfileEnable: l.conf.ProfileEnable, + ControllerFD: l.ctrl.srv.FD(), + } + if err := filter.Install(opts); err != nil { + return fmt.Errorf("installing seccomp filters: %v", err) + } + } + return nil +} + // Run runs the root container. func (l *Loader) Run() error { err := l.run() @@ -470,39 +493,33 @@ func (l *Loader) run() error { return fmt.Errorf("trying to start deleted container %q", l.sandboxID) } - // Finally done with all configuration. Setup filters before user code - // is loaded. - if l.conf.DisableSeccomp { - filter.Report("syscall filter is DISABLED. Running in less secure mode.") - } else { - opts := filter.Options{ - Platform: l.k.Platform, - HostNetwork: l.conf.Network == NetworkHost, - ProfileEnable: l.conf.ProfileEnable, - ControllerFD: l.ctrl.srv.FD(), - } - if err := filter.Install(opts); err != nil { - return fmt.Errorf("installing seccomp filters: %v", err) - } - } - // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. if !l.restore { + if l.conf.ProfileEnable { + initializePProf() + } + + // Finally done with all configuration. Setup filters before user code + // is loaded. + if err := l.installSeccompFilters(); err != nil { + return err + } + // Create the FD map, which will set stdin, stdout, and stderr. If console // is true, then ioctl calls will be passed through to the host fd. ctx := l.rootProcArgs.NewContext(l.k) - fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs) + fdTable, err := createFDTable(ctx, l.console, l.stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } // CreateProcess takes a reference on FDMap if successful. We won't need // ours either way. - l.rootProcArgs.FDMap = fdm + l.rootProcArgs.FDTable = fdTable // cid for root container can be empty. Only subcontainers need it to set // the mount location. - mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k) + mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints) if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil { return err } @@ -513,19 +530,37 @@ func (l *Loader) run() error { return err } + // Read /etc/passwd for the user's HOME directory and set the HOME + // environment variable as required by POSIX if it is not overridden by + // the user. + hasHomeEnvv := false + for _, envv := range l.rootProcArgs.Envv { + if strings.HasPrefix(envv, "HOME=") { + hasHomeEnvv = true + } + } + if !hasHomeEnvv { + homeDir, err := getExecUserHome(rootCtx, rootMns, uint32(l.rootProcArgs.Credentials.RealKUID)) + if err != nil { + return fmt.Errorf("error reading exec user: %v", err) + } + + l.rootProcArgs.Envv = append(l.rootProcArgs.Envv, "HOME="+homeDir) + } + // Create the root container init task. It will begin running // when the kernel is started. if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { return fmt.Errorf("creating init process: %v", err) } - // CreateProcess takes a reference on FDMap if successful. - l.rootProcArgs.FDMap.DecRef() + // CreateProcess takes a reference on FDTable if successful. + l.rootProcArgs.FDTable.DecRef() } ep.tg = l.k.GlobalInit() if l.console { - ttyFile := l.rootProcArgs.FDMap.GetFile(0) + ttyFile, _ := l.rootProcArgs.FDTable.Get(0) defer ttyFile.DecRef() ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) @@ -605,13 +640,13 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file // Create the FD map, which will set stdin, stdout, and stderr. ctx := procArgs.NewContext(l.k) - fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs) + fdTable, err := createFDTable(ctx, false, stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } - // CreateProcess takes a reference on FDMap if successful. We won't need ours - // either way. - procArgs.FDMap = fdm + // CreateProcess takes a reference on fdTable if successful. We won't + // need ours either way. + procArgs.FDTable = fdTable // Can't take ownership away from os.File. dup them to get a new FDs. var goferFDs []int @@ -623,7 +658,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file goferFDs = append(goferFDs, fd) } - mntr := newContainerMounter(spec, cid, goferFDs, l.k) + mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints) if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil { return fmt.Errorf("configuring container FS: %v", err) } @@ -640,8 +675,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file } l.k.StartProcess(tg) - // CreateProcess takes a reference on FDMap if successful. - procArgs.FDMap.DecRef() + // CreateProcess takes a reference on FDTable if successful. + procArgs.FDTable.DecRef() l.processes[eid].tg = tg return nil @@ -805,9 +840,17 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { // privileges. Raw: true, })} + + // Enable SACK Recovery. if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { return nil, fmt.Errorf("failed to enable SACK: %v", err) } + + // Enable Receive Buffer Auto-Tuning. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) + } + return &s, nil default: @@ -956,3 +999,8 @@ func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host } return ep.tg, ep.tty, nil } + +func init() { + // TODO(gvisor.dev/issue/365): Make this configurable. + refs.SetLeakMode(refs.NoLeakChecking) +} |