summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/kernel.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/kernel.go')
-rw-r--r--pkg/sentry/kernel/kernel.go957
1 files changed, 957 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
new file mode 100644
index 000000000..0932965e0
--- /dev/null
+++ b/pkg/sentry/kernel/kernel.go
@@ -0,0 +1,957 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kernel provides an emulation of the Linux kernel.
+//
+// See README.md for a detailed overview.
+//
+// Lock order (outermost locks must be taken first):
+//
+// Kernel.extMu
+// TaskSet.mu
+// SignalHandlers.mu
+// Task.mu
+//
+// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
+// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
+// time requires locking all of their signal mutexes first.
+package kernel
+
+import (
+ "fmt"
+ "io"
+ "path/filepath"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port"
+ sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+// Kernel represents an emulated Linux kernel. It must be initialized by calling
+// Init() or LoadFrom().
+type Kernel struct {
+ // extMu serializes external changes to the Kernel with calls to
+ // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
+ // remains frozen for the duration of the call; it requires that the Kernel
+ // is paused as a precondition, which ensures that none of the tasks
+ // running within the Kernel can affect its state, but extMu is required to
+ // ensure that concurrent users of the Kernel *outside* the Kernel's
+ // control cannot affect its state by calling e.g.
+ // Kernel.SendExternalSignal.)
+ extMu sync.Mutex `state:"nosave"`
+
+ // started is true if Start has been called. Unless otherwise specified,
+ // all Kernel fields become immutable once started becomes true.
+ started bool `state:"nosave"`
+
+ // All of the following fields are immutable unless otherwise specified.
+
+ // Platform is the platform that is used to execute tasks in the
+ // created Kernel. It is embedded so that Kernel can directly serve as
+ // Platform in mm logic and also serve as platform.MemoryProvider in
+ // filemem S/R logic.
+ platform.Platform `state:"nosave"`
+
+ // See InitKernelArgs for the meaning of these fields.
+ featureSet *cpuid.FeatureSet
+ timekeeper *Timekeeper
+ tasks *TaskSet
+ rootUserNamespace *auth.UserNamespace
+ networkStack inet.Stack `state:"nosave"`
+ applicationCores uint
+ useHostCores bool
+ extraAuxv []arch.AuxEntry
+ vdso *loader.VDSO
+ rootUTSNamespace *UTSNamespace
+ rootIPCNamespace *IPCNamespace
+
+ // mounts holds the state of the virtual filesystem. mounts is initially
+ // nil, and must be set by calling Kernel.SetRootMountNamespace before
+ // Kernel.CreateProcess can succeed.
+ mounts *fs.MountNamespace
+
+ // globalInit is the thread group whose leader has ID 1 in the root PID
+ // namespace. globalInit is stored separately so that it is accessible even
+ // after all tasks in the thread group have exited, such that ID 1 is no
+ // longer mapped.
+ //
+ // globalInit is mutable until it is assigned by the first successful call
+ // to CreateProcess, and is protected by extMu.
+ globalInit *ThreadGroup
+
+ // realtimeClock is a ktime.Clock based on timekeeper's Realtime.
+ realtimeClock *timekeeperClock
+
+ // monotonicClock is a ktime.Clock based on timekeeper's Monotonic.
+ monotonicClock *timekeeperClock
+
+ // syslog is the kernel log.
+ syslog syslog
+
+ // cpuClock is incremented every linux.ClockTick. cpuClock is used to
+ // measure task CPU usage, since sampling monotonicClock twice on every
+ // syscall turns out to be unreasonably expensive. This is similar to how
+ // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING),
+ // although Linux also uses scheduler timing information to improve
+ // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do
+ // since "preeemptive" scheduling is managed by the Go runtime, which
+ // doesn't provide this information.
+ //
+ // cpuClock is mutable, and is accessed using atomic memory operations.
+ cpuClock uint64
+
+ // cpuClockTicker increments cpuClock.
+ cpuClockTicker *ktime.Timer `state:"nosave"`
+
+ // fdMapUids is an ever-increasing counter for generating FDMap uids.
+ //
+ // fdMapUids is mutable, and is accessed using atomic memory operations.
+ fdMapUids uint64
+
+ // uniqueID is used to generate unique identifiers.
+ //
+ // uniqueID is mutable, and is accessed using atomic memory operations.
+ uniqueID uint64
+
+ // nextInotifyCookie is a monotonically increasing counter used for
+ // generating unique inotify event cookies.
+ //
+ // nextInotifyCookie is mutable, and is accesed using atomic memory
+ // operations.
+ nextInotifyCookie uint32
+
+ // netlinkPorts manages allocation of netlink socket port IDs.
+ netlinkPorts *port.Manager
+
+ // exitErr is the error causing the sandbox to exit, if any. It is
+ // protected by extMu.
+ exitErr error
+}
+
+// InitKernelArgs holds arguments to Init.
+type InitKernelArgs struct {
+ // FeatureSet is the emulated CPU feature set.
+ FeatureSet *cpuid.FeatureSet
+
+ // Timekeeper manages time for all tasks in the system.
+ Timekeeper *Timekeeper
+
+ // RootUserNamespace is the root user namespace.
+ RootUserNamespace *auth.UserNamespace
+
+ // NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
+ NetworkStack inet.Stack
+
+ // ApplicationCores is the number of logical CPUs visible to sandboxed
+ // applications. The set of logical CPU IDs is [0, ApplicationCores); thus
+ // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
+ // most significant bit in cpu_possible_mask + 1.
+ ApplicationCores uint
+
+ // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
+ // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
+ // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
+ // will be overridden.
+ UseHostCores bool
+
+ // ExtraAuxv contains additional auxiliary vector entries that are added to
+ // each process by the ELF loader.
+ ExtraAuxv []arch.AuxEntry
+
+ // Vdso holds the VDSO and its parameter page.
+ Vdso *loader.VDSO
+
+ // RootUTSNamespace is the root UTS namepsace.
+ RootUTSNamespace *UTSNamespace
+
+ // RootIPCNamespace is the root IPC namepsace.
+ RootIPCNamespace *IPCNamespace
+}
+
+// Init initialize the Kernel with no tasks.
+//
+// Callers must manually set Kernel.Platform before caling Init.
+func (k *Kernel) Init(args InitKernelArgs) error {
+ if args.FeatureSet == nil {
+ return fmt.Errorf("FeatureSet is nil")
+ }
+ if args.Timekeeper == nil {
+ return fmt.Errorf("Timekeeper is nil")
+ }
+ if args.RootUserNamespace == nil {
+ return fmt.Errorf("RootUserNamespace is nil")
+ }
+ if args.ApplicationCores == 0 {
+ return fmt.Errorf("ApplicationCores is 0")
+ }
+
+ k.featureSet = args.FeatureSet
+ k.timekeeper = args.Timekeeper
+ k.tasks = newTaskSet()
+ k.rootUserNamespace = args.RootUserNamespace
+ k.rootUTSNamespace = args.RootUTSNamespace
+ k.rootIPCNamespace = args.RootIPCNamespace
+ k.networkStack = args.NetworkStack
+ k.applicationCores = args.ApplicationCores
+ if args.UseHostCores {
+ k.useHostCores = true
+ maxCPU, err := hostcpu.MaxPossibleCPU()
+ if err != nil {
+ return fmt.Errorf("Failed to get maximum CPU number: %v", err)
+ }
+ minAppCores := uint(maxCPU) + 1
+ if k.applicationCores < minAppCores {
+ log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
+ k.applicationCores = minAppCores
+ }
+ }
+ k.extraAuxv = args.ExtraAuxv
+ k.vdso = args.Vdso
+ k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime}
+ k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
+ k.netlinkPorts = port.New()
+
+ return nil
+}
+
+// SaveTo saves the state of k to w.
+//
+// Preconditions: The kernel must be paused throughout the call to SaveTo.
+func (k *Kernel) SaveTo(w io.Writer) error {
+ saveStart := time.Now()
+ ctx := k.SupervisorContext()
+
+ // Do not allow other Kernel methods to affect it while it's being saved.
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+
+ // Stop time.
+ k.pauseTimeLocked()
+ defer k.resumeTimeLocked()
+
+ // Flush write operations on open files so data reaches backing storage.
+ if err := k.tasks.flushWritesToFiles(ctx); err != nil {
+ return err
+ }
+
+ // Remove all epoll waiter objects from underlying wait queues.
+ // NOTE: for programs to resume execution in future snapshot scenarios,
+ // we will need to re-establish these waiter objects after saving.
+ k.tasks.unregisterEpollWaiters()
+
+ // Clear the dirent cache before saving because Dirents must be Loaded in a
+ // particular order (parents before children), and Loading dirents from a cache
+ // breaks that order.
+ k.mounts.FlushMountSourceRefs()
+
+ // Ensure that all pending asynchronous work is complete:
+ // - inode and mount release
+ // - asynchronuous IO
+ fs.AsyncBarrier()
+
+ // Once all fs work has completed (flushed references have all been released),
+ // reset mount mappings. This allows individual mounts to save how inodes map
+ // to filesystem resources. Without this, fs.Inodes cannot be restored.
+ fs.SaveInodeMappings()
+
+ // Discard unsavable mappings, such as those for host file descriptors.
+ // This must be done after waiting for "asynchronous fs work", which
+ // includes async I/O that may touch application memory.
+ if err := k.invalidateUnsavableMappings(ctx); err != nil {
+ return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+ }
+
+ // Save the kernel state.
+ kernelStart := time.Now()
+ var stats state.Stats
+ if err := state.Save(w, k, &stats); err != nil {
+ return err
+ }
+ log.Infof("Kernel save stats: %s", &stats)
+ log.Infof("Kernel save took [%s].", time.Since(kernelStart))
+
+ // Save the memory state.
+ //
+ // FIXME: In the future, this should not be dispatched via
+ // an abstract memory type. This should be dispatched to a single
+ // memory implementation that belongs to the kernel. (There is
+ // currently a single implementation anyways, it just needs to be
+ // "unabstracted" and reparented appropriately.)
+ memoryStart := time.Now()
+ if err := k.Platform.Memory().SaveTo(w); err != nil {
+ return err
+ }
+ log.Infof("Memory save took [%s].", time.Since(memoryStart))
+
+ log.Infof("Overall save took [%s].", time.Since(saveStart))
+
+ return nil
+}
+
+func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
+ ts.mu.RLock()
+ defer ts.mu.RUnlock()
+ for t := range ts.Root.tids {
+ if fdmap := t.FDMap(); fdmap != nil {
+ for _, desc := range fdmap.files {
+ if flags := desc.file.Flags(); !flags.Write {
+ continue
+ }
+ if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
+ continue
+ }
+ // Here we need all metadata synced.
+ syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+ if err := fs.SaveFileFsyncError(syncErr); err != nil {
+ name, _ := desc.file.Dirent.FullName(nil /* root */)
+ return fmt.Errorf("%q was not sufficiently synced: %v", name, err)
+ }
+ }
+ }
+ }
+ return nil
+}
+
+// Preconditions: The kernel must be paused.
+func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
+ invalidated := make(map[*mm.MemoryManager]struct{})
+ k.tasks.mu.RLock()
+ defer k.tasks.mu.RUnlock()
+ for t := range k.tasks.Root.tids {
+ // We can skip locking Task.mu here since the kernel is paused.
+ if mm := t.tc.MemoryManager; mm != nil {
+ if _, ok := invalidated[mm]; !ok {
+ if err := mm.InvalidateUnsavable(ctx); err != nil {
+ return err
+ }
+ invalidated[mm] = struct{}{}
+ }
+ }
+ // I really wish we just had a sync.Map of all MMs...
+ if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
+ if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+func (ts *TaskSet) unregisterEpollWaiters() {
+ ts.mu.RLock()
+ defer ts.mu.RUnlock()
+ for t := range ts.Root.tids {
+ if fdmap := t.FDMap(); fdmap != nil {
+ for _, desc := range fdmap.files {
+ if desc.file != nil {
+ if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok {
+ e.UnregisterEpollWaiters()
+ }
+ }
+ }
+ }
+ }
+}
+
+// LoadFrom returns a new Kernel loaded from args.
+func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) error {
+ loadStart := time.Now()
+ if p == nil {
+ return fmt.Errorf("Platform is nil")
+ }
+
+ k.Platform = p
+ k.networkStack = net
+
+ initAppCores := k.applicationCores
+
+ // Load the kernel state.
+ kernelStart := time.Now()
+ var stats state.Stats
+ if err := state.Load(r, k, &stats); err != nil {
+ return err
+ }
+ log.Infof("Kernel load stats: %s", &stats)
+ log.Infof("Kernel load took [%s].", time.Since(kernelStart))
+
+ // Load the memory state.
+ //
+ // See the note in SaveTo.
+ memoryStart := time.Now()
+ if err := k.Platform.Memory().LoadFrom(r); err != nil {
+ return err
+ }
+ log.Infof("Memory load took [%s].", time.Since(memoryStart))
+
+ // Ensure that all pending asynchronous work is complete:
+ // - namedpipe opening
+ // - inode file opening
+ fs.AsyncBarrier()
+
+ log.Infof("Overall load took [%s]", time.Since(loadStart))
+
+ // Applications may size per-cpu structures based on k.applicationCores, so
+ // it can't change across save/restore. When we are virtualizing CPU
+ // numbers, this isn't a problem. However, when we are exposing host CPU
+ // assignments, we can't tolerate an increase in the number of host CPUs,
+ // which could result in getcpu(2) returning CPUs that applications expect
+ // not to exist.
+ if k.useHostCores && initAppCores > k.applicationCores {
+ return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
+ }
+
+ return nil
+}
+
+// Destroy releases resources owned by k.
+//
+// Preconditions: There must be no task goroutines running in k.
+func (k *Kernel) Destroy() {
+ if k.mounts != nil {
+ k.mounts.DecRef()
+ k.mounts = nil
+ }
+}
+
+// UniqueID returns a unique identifier.
+func (k *Kernel) UniqueID() uint64 {
+ id := atomic.AddUint64(&k.uniqueID, 1)
+ if id == 0 {
+ panic("unique identifier generator wrapped around")
+ }
+ return id
+}
+
+// CreateProcessArgs holds arguments to kernel.CreateProcess.
+type CreateProcessArgs struct {
+ // Filename is the filename to load.
+ //
+ // If this is provided as "", then the file will be guessed via Argv[0].
+ Filename string
+
+ // Argvv is a list of arguments.
+ Argv []string
+
+ // Envv is a list of environment variables.
+ Envv []string
+
+ // WorkingDirectory is the initial working directory.
+ //
+ // This defaults to the root if empty.
+ WorkingDirectory string
+
+ // Credentials is the initial credentials.
+ Credentials *auth.Credentials
+
+ // FDMap is the initial set of file descriptors. If CreateProcess succeeds,
+ // it takes a reference on FDMap.
+ FDMap *FDMap
+
+ // Umask is the initial umask.
+ Umask uint
+
+ // Limits is the initial resource limits.
+ Limits *limits.LimitSet
+
+ // MaxSymlinkTraversals is the maximum number of symlinks to follow
+ // during resolution.
+ MaxSymlinkTraversals uint
+
+ // UTSNamespace is the initial UTS namespace.
+ UTSNamespace *UTSNamespace
+
+ // IPCNamespace is the initial IPC namespace.
+ IPCNamespace *IPCNamespace
+}
+
+// NewContext returns a context.Context that represents the task that will be
+// created by args.NewContext(k).
+func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext {
+ return &createProcessContext{
+ Logger: log.Log(),
+ k: k,
+ args: args,
+ }
+}
+
+// createProcessContext is a context.Context that represents the context
+// associated with a task that is being created.
+type createProcessContext struct {
+ context.NoopSleeper
+ log.Logger
+ k *Kernel
+ args *CreateProcessArgs
+}
+
+// Value implements context.Context.Value.
+func (ctx *createProcessContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxKernel:
+ return ctx.k
+ case CtxPIDNamespace:
+ // "The new task ... is in the root PID namespace." -
+ // Kernel.CreateProcess
+ return ctx.k.tasks.Root
+ case CtxUTSNamespace:
+ return ctx.args.UTSNamespace
+ case CtxIPCNamespace:
+ return ctx.args.IPCNamespace
+ case auth.CtxCredentials:
+ return ctx.args.Credentials
+ case fs.CtxRoot:
+ if ctx.k.mounts == nil {
+ return nil
+ }
+ return ctx.k.mounts.Root()
+ case ktime.CtxRealtimeClock:
+ return ctx.k.RealtimeClock()
+ case limits.CtxLimits:
+ return ctx.args.Limits
+ case platform.CtxPlatform:
+ return ctx.k
+ case uniqueid.CtxGlobalUniqueID:
+ return ctx.k.UniqueID()
+ case uniqueid.CtxInotifyCookie:
+ return ctx.k.GenerateInotifyCookie()
+ default:
+ return nil
+ }
+}
+
+// CreateProcess creates a new task in a new thread group with the given
+// options. The new task has no parent and is in the root PID namespace.
+//
+// If k.Start() has already been called, the created task will begin running
+// immediately. Otherwise, it will be started when k.Start() is called.
+//
+// CreateProcess has no analogue in Linux; it is used to create the initial
+// application task, as well as processes started by the control server.
+func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ log.Infof("EXEC: %v", args.Argv)
+
+ if k.mounts == nil {
+ return nil, fmt.Errorf("no kernel MountNamespace")
+ }
+
+ tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+ ctx := args.NewContext(k)
+
+ // Grab the root directory.
+ root := fs.RootFromContext(ctx)
+ defer root.DecRef()
+
+ // Grab the working directory.
+ wd := root // Default.
+ if args.WorkingDirectory != "" {
+ var err error
+ wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals)
+ if err != nil {
+ return nil, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+ }
+ defer wd.DecRef()
+ }
+
+ if args.Filename == "" {
+ // Was anything provided?
+ if len(args.Argv) == 0 {
+ return nil, fmt.Errorf("no filename or command provided")
+ }
+ if !filepath.IsAbs(args.Argv[0]) {
+ return nil, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
+ }
+ args.Filename = args.Argv[0]
+ }
+
+ // Create a fresh task context.
+ tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet)
+ if err != nil {
+ return nil, err
+ }
+ tr := newTaskResources(args.FDMap, newFSContext(root, wd, args.Umask))
+ // NewTask unconditionally takes ownership of tr, so we never have to call
+ // tr.release.
+
+ // Create the task.
+ config := &TaskConfig{
+ Kernel: k,
+ ThreadGroup: tg,
+ TaskContext: tc,
+ TaskResources: tr,
+ Credentials: args.Credentials,
+ UTSNamespace: args.UTSNamespace,
+ IPCNamespace: args.IPCNamespace,
+ AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores),
+ }
+ t, err := k.tasks.NewTask(config)
+ if err != nil {
+ return nil, err
+ }
+
+ // Success.
+ if k.started {
+ tid := k.tasks.Root.IDOfTask(t)
+ t.Start(tid)
+ } else if k.globalInit == nil {
+ k.globalInit = tg
+ }
+ return tg, nil
+}
+
+// Start starts execution of all tasks in k.
+//
+// Preconditions: Start may be called exactly once.
+func (k *Kernel) Start() error {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+
+ if k.globalInit == nil {
+ return fmt.Errorf("kernel contains no tasks")
+ }
+ if k.started {
+ return fmt.Errorf("kernel already started")
+ }
+
+ k.started = true
+ k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, kernelCPUClockListener{k})
+ k.cpuClockTicker.Swap(ktime.Setting{
+ Enabled: true,
+ Period: linux.ClockTick,
+ })
+ // If k was created by LoadKernelFrom, timers were stopped during
+ // Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
+ // this is a no-op.
+ k.resumeTimeLocked()
+ // Start task goroutines.
+ k.tasks.mu.RLock()
+ defer k.tasks.mu.RUnlock()
+ for t, tid := range k.tasks.Root.tids {
+ t.Start(tid)
+ }
+ return nil
+}
+
+// pauseTimeLocked pauses all Timers and Timekeeper updates.
+//
+// Preconditions: Any task goroutines running in k must be stopped. k.extMu
+// must be locked.
+func (k *Kernel) pauseTimeLocked() {
+ // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
+ // Kernel.Start().
+ if k.cpuClockTicker != nil {
+ k.cpuClockTicker.Pause()
+ }
+
+ // By precondition, nothing else can be interacting with PIDNamespace.tids
+ // or FDMap.files, so we can iterate them without synchronization. (We
+ // can't hold the TaskSet mutex when pausing thread group timers because
+ // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
+ // mutex, while holding the Timer mutex.)
+ for t := range k.tasks.Root.tids {
+ if t == t.tg.leader {
+ t.tg.tm.pause()
+ }
+ // This means we'll iterate FDMaps shared by multiple tasks repeatedly,
+ // but ktime.Timer.Pause is idempotent so this is harmless.
+ if fdm := t.tr.FDMap; fdm != nil {
+ for _, desc := range fdm.files {
+ if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ tfd.PauseTimer()
+ }
+ }
+ }
+ }
+ k.timekeeper.PauseUpdates()
+}
+
+// resumeTimeLocked resumes all Timers and Timekeeper updates. If
+// pauseTimeLocked has not been previously called, resumeTimeLocked has no
+// effect.
+//
+// Preconditions: Any task goroutines running in k must be stopped. k.extMu
+// must be locked.
+func (k *Kernel) resumeTimeLocked() {
+ if k.cpuClockTicker != nil {
+ k.cpuClockTicker.Resume()
+ }
+
+ k.timekeeper.ResumeUpdates()
+ for t := range k.tasks.Root.tids {
+ if t == t.tg.leader {
+ t.tg.tm.resume()
+ }
+ if fdm := t.tr.FDMap; fdm != nil {
+ for _, desc := range fdm.files {
+ if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ tfd.ResumeTimer()
+ }
+ }
+ }
+ }
+}
+
+// WaitExited blocks until all tasks in k have exited.
+func (k *Kernel) WaitExited() {
+ k.tasks.liveGoroutines.Wait()
+}
+
+// Kill requests that all tasks in k immediately exit as if group exiting with
+// status es. Kill does not wait for tasks to exit.
+func (k *Kernel) Kill(es ExitStatus) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.tasks.Kill(es)
+}
+
+// Pause requests that all tasks in k temporarily stop executing, and blocks
+// until all tasks in k have stopped. Multiple calls to Pause nest and require
+// an equal number of calls to Unpause to resume execution.
+func (k *Kernel) Pause() {
+ k.extMu.Lock()
+ k.tasks.BeginExternalStop()
+ k.extMu.Unlock()
+ k.tasks.runningGoroutines.Wait()
+}
+
+// Unpause ends the effect of a previous call to Pause. If Unpause is called
+// without a matching preceding call to Pause, Unpause may panic.
+func (k *Kernel) Unpause() {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.tasks.EndExternalStop()
+}
+
+// SendExternalSignal injects a signal into the kernel.
+//
+// context is used only for debugging to describe how the signal was received.
+//
+// Returns false if signal could not be sent because the Kernel is not fully
+// initialized yet.
+func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) bool {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.sendExternalSignal(info, context)
+}
+
+// FeatureSet returns the FeatureSet.
+func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
+ return k.featureSet
+}
+
+// Timekeeper returns the Timekeeper.
+func (k *Kernel) Timekeeper() *Timekeeper {
+ return k.timekeeper
+}
+
+// TaskSet returns the TaskSet.
+func (k *Kernel) TaskSet() *TaskSet {
+ return k.tasks
+}
+
+// RootUserNamespace returns the root UserNamespace.
+func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
+ return k.rootUserNamespace
+}
+
+// RootUTSNamespace returns the root UTSNamespace.
+func (k *Kernel) RootUTSNamespace() *UTSNamespace {
+ return k.rootUTSNamespace
+}
+
+// RootIPCNamespace returns the root IPCNamespace.
+func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+ return k.rootIPCNamespace
+}
+
+// RootMountNamespace returns the MountNamespace.
+func (k *Kernel) RootMountNamespace() *fs.MountNamespace {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.mounts
+}
+
+// SetRootMountNamespace sets the MountNamespace.
+func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.mounts = mounts
+}
+
+// NetworkStack returns the network stack. NetworkStack may return nil if no
+// network stack is available.
+func (k *Kernel) NetworkStack() inet.Stack {
+ return k.networkStack
+}
+
+// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
+// nil if no such thread group exists. GlobalInit may return a thread group
+// containing no tasks if the thread group has already exited.
+func (k *Kernel) GlobalInit() *ThreadGroup {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.globalInit
+}
+
+// ApplicationCores returns the number of CPUs visible to sandboxed
+// applications.
+func (k *Kernel) ApplicationCores() uint {
+ return k.applicationCores
+}
+
+// RealtimeClock returns the application CLOCK_REALTIME clock.
+func (k *Kernel) RealtimeClock() ktime.Clock {
+ return k.realtimeClock
+}
+
+// MonotonicClock returns the application CLOCK_MONOTONIC clock.
+func (k *Kernel) MonotonicClock() ktime.Clock {
+ return k.monotonicClock
+}
+
+// CPUClockNow returns the current value of k.cpuClock.
+func (k *Kernel) CPUClockNow() uint64 {
+ return atomic.LoadUint64(&k.cpuClock)
+}
+
+// Syslog returns the syslog.
+func (k *Kernel) Syslog() *syslog {
+ return &k.syslog
+}
+
+// GenerateInotifyCookie generates a unique inotify event cookie.
+//
+// Returned values may overlap with previously returned values if the value
+// space is exhausted. 0 is not a valid cookie value, all other values
+// representable in a uint32 are allowed.
+func (k *Kernel) GenerateInotifyCookie() uint32 {
+ id := atomic.AddUint32(&k.nextInotifyCookie, 1)
+ // Wrap-around is explicitly allowed for inotify event cookies.
+ if id == 0 {
+ id = atomic.AddUint32(&k.nextInotifyCookie, 1)
+ }
+ return id
+}
+
+// NetlinkPorts returns the netlink port manager.
+func (k *Kernel) NetlinkPorts() *port.Manager {
+ return k.netlinkPorts
+}
+
+// ExitError returns the sandbox error that caused the kernel to exit.
+func (k *Kernel) ExitError() error {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.exitErr
+}
+
+// SetExitError sets the sandbox error that caused the kernel to exit, if one is
+// not already set.
+func (k *Kernel) SetExitError(err error) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ if k.exitErr == nil {
+ k.exitErr = err
+ }
+}
+
+// SupervisorContext returns a Context with maximum privileges in k. It should
+// only be used by goroutines outside the control of the emulated kernel
+// defined by e.
+//
+// Callers are responsible for ensuring that the returned Context is not used
+// concurrently with changes to the Kernel.
+func (k *Kernel) SupervisorContext() context.Context {
+ return supervisorContext{
+ Logger: log.Log(),
+ k: k,
+ }
+}
+
+type supervisorContext struct {
+ context.NoopSleeper
+ log.Logger
+ k *Kernel
+}
+
+// Value implements context.Context.
+func (ctx supervisorContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxCanTrace:
+ // The supervisor context can trace anything. (None of
+ // supervisorContext's users are expected to invoke ptrace, but ptrace
+ // permissions are required for certain file accesses.)
+ return func(*Task, bool) bool { return true }
+ case CtxKernel:
+ return ctx.k
+ case CtxPIDNamespace:
+ return ctx.k.tasks.Root
+ case CtxUTSNamespace:
+ return ctx.k.rootUTSNamespace
+ case CtxIPCNamespace:
+ return ctx.k.rootIPCNamespace
+ case auth.CtxCredentials:
+ // The supervisor context is global root.
+ return auth.NewRootCredentials(ctx.k.rootUserNamespace)
+ case fs.CtxRoot:
+ return ctx.k.mounts.Root()
+ case ktime.CtxRealtimeClock:
+ return ctx.k.RealtimeClock()
+ case limits.CtxLimits:
+ // No limits apply.
+ return limits.NewLimitSet()
+ case platform.CtxPlatform:
+ return ctx.k
+ case uniqueid.CtxGlobalUniqueID:
+ return ctx.k.UniqueID()
+ case uniqueid.CtxInotifyCookie:
+ return ctx.k.GenerateInotifyCookie()
+ default:
+ return nil
+ }
+}
+
+type kernelCPUClockListener struct {
+ k *Kernel
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (l kernelCPUClockListener) Notify(exp uint64) {
+ atomic.AddUint64(&l.k.cpuClock, exp)
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (l kernelCPUClockListener) Destroy() {
+}