author     Googler <noreply@google.com>          2018-04-27 10:37:02 -0700
committer  Adin Scannell <ascannell@google.com>  2018-04-28 01:44:26 -0400
commit     d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree       54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/kernel/kernel.go
parent     f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/kernel/kernel.go')
-rw-r--r--  pkg/sentry/kernel/kernel.go  957
1 file changed, 957 insertions(+), 0 deletions(-)
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
new file mode 100644
index 000000000..0932965e0
--- /dev/null
+++ b/pkg/sentry/kernel/kernel.go
@@ -0,0 +1,957 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernel provides an emulation of the Linux kernel.
//
// See README.md for a detailed overview.
//
// Lock order (outermost locks must be taken first):
//
// Kernel.extMu
// TaskSet.mu
// SignalHandlers.mu
// Task.mu
//
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
// time requires locking all of their signal mutexes first.
package kernel

import (
    "fmt"
    "io"
    "path/filepath"
    "sync"
    "sync/atomic"
    "time"

    "gvisor.googlesource.com/gvisor/pkg/abi/linux"
    "gvisor.googlesource.com/gvisor/pkg/cpuid"
    "gvisor.googlesource.com/gvisor/pkg/log"
    "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
    "gvisor.googlesource.com/gvisor/pkg/sentry/context"
    "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
    "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd"
    "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
    "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
    "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
    "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll"
    "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
    ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
    "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
    "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
    "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
    "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
    "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port"
    sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
    "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
    "gvisor.googlesource.com/gvisor/pkg/state"
)
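// Illustrative sketch (not part of the checked-in file): the lock order from
// the package comment above, outermost-first. The accessor path from a task
// to its SignalHandlers is hypothetical here; only the ordering is the point.
//
//     ts.mu.Lock()   // TaskSet.mu first (after Kernel.extMu, if held)
//     sh.mu.Lock()   // then SignalHandlers.mu
//     t.mu.Lock()    // then Task.mu, innermost
//     // ... mutate task state ...
//     t.mu.Unlock()
//     sh.mu.Unlock()
//     ts.mu.Unlock()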
// Kernel represents an emulated Linux kernel. It must be initialized by
// calling Init() or LoadFrom().
type Kernel struct {
    // extMu serializes external changes to the Kernel with calls to
    // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
    // remains frozen for the duration of the call; it requires that the Kernel
    // is paused as a precondition, which ensures that none of the tasks
    // running within the Kernel can affect its state, but extMu is required to
    // ensure that concurrent users of the Kernel *outside* the Kernel's
    // control cannot affect its state by calling e.g.
    // Kernel.SendExternalSignal.)
    extMu sync.Mutex `state:"nosave"`

    // started is true if Start has been called. Unless otherwise specified,
    // all Kernel fields become immutable once started becomes true.
    started bool `state:"nosave"`

    // All of the following fields are immutable unless otherwise specified.

    // Platform is the platform that is used to execute tasks in the created
    // Kernel. It is embedded so that Kernel can directly serve as Platform in
    // mm logic and also serve as platform.MemoryProvider in filemem S/R logic.
    platform.Platform `state:"nosave"`

    // See InitKernelArgs for the meaning of these fields.
    featureSet        *cpuid.FeatureSet
    timekeeper        *Timekeeper
    tasks             *TaskSet
    rootUserNamespace *auth.UserNamespace
    networkStack      inet.Stack `state:"nosave"`
    applicationCores  uint
    useHostCores      bool
    extraAuxv         []arch.AuxEntry
    vdso              *loader.VDSO
    rootUTSNamespace  *UTSNamespace
    rootIPCNamespace  *IPCNamespace

    // mounts holds the state of the virtual filesystem. mounts is initially
    // nil, and must be set by calling Kernel.SetRootMountNamespace before
    // Kernel.CreateProcess can succeed.
    mounts *fs.MountNamespace

    // globalInit is the thread group whose leader has ID 1 in the root PID
    // namespace. globalInit is stored separately so that it is accessible even
    // after all tasks in the thread group have exited, such that ID 1 is no
    // longer mapped.
    //
    // globalInit is mutable until it is assigned by the first successful call
    // to CreateProcess, and is protected by extMu.
    globalInit *ThreadGroup

    // realtimeClock is a ktime.Clock based on timekeeper's Realtime.
    realtimeClock *timekeeperClock

    // monotonicClock is a ktime.Clock based on timekeeper's Monotonic.
    monotonicClock *timekeeperClock

    // syslog is the kernel log.
    syslog syslog

    // cpuClock is incremented every linux.ClockTick. cpuClock is used to
    // measure task CPU usage, since sampling monotonicClock twice on every
    // syscall turns out to be unreasonably expensive. This is similar to how
    // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING),
    // although Linux also uses scheduler timing information to improve
    // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do
    // since "preemptive" scheduling is managed by the Go runtime, which
    // doesn't provide this information.
    //
    // cpuClock is mutable, and is accessed using atomic memory operations.
    cpuClock uint64

    // cpuClockTicker increments cpuClock.
    cpuClockTicker *ktime.Timer `state:"nosave"`

    // fdMapUids is an ever-increasing counter for generating FDMap uids.
    //
    // fdMapUids is mutable, and is accessed using atomic memory operations.
    fdMapUids uint64

    // uniqueID is used to generate unique identifiers.
    //
    // uniqueID is mutable, and is accessed using atomic memory operations.
    uniqueID uint64

    // nextInotifyCookie is a monotonically increasing counter used for
    // generating unique inotify event cookies.
    //
    // nextInotifyCookie is mutable, and is accessed using atomic memory
    // operations.
    nextInotifyCookie uint32

    // netlinkPorts manages allocation of netlink socket port IDs.
    netlinkPorts *port.Manager

    // exitErr is the error causing the sandbox to exit, if any. It is
    // protected by extMu.
    exitErr error
}
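// Illustrative sketch (not part of the checked-in file): the access pattern
// the field comments above describe for the mutable counters. They are
// mutated and read only via sync/atomic, never under a lock.
//
//     next := atomic.AddUint64(&k.fdMapUids, 1) // allocate a new FDMap uid
//     now := atomic.LoadUint64(&k.cpuClock)     // sample the task CPU clock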
// InitKernelArgs holds arguments to Init.
type InitKernelArgs struct {
    // FeatureSet is the emulated CPU feature set.
    FeatureSet *cpuid.FeatureSet

    // Timekeeper manages time for all tasks in the system.
    Timekeeper *Timekeeper

    // RootUserNamespace is the root user namespace.
    RootUserNamespace *auth.UserNamespace

    // NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
    NetworkStack inet.Stack

    // ApplicationCores is the number of logical CPUs visible to sandboxed
    // applications. The set of logical CPU IDs is [0, ApplicationCores); thus
    // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
    // most significant bit in cpu_possible_mask + 1.
    ApplicationCores uint

    // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
    // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
    // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
    // will be overridden.
    UseHostCores bool

    // ExtraAuxv contains additional auxiliary vector entries that are added to
    // each process by the ELF loader.
    ExtraAuxv []arch.AuxEntry

    // Vdso holds the VDSO and its parameter page.
    Vdso *loader.VDSO

    // RootUTSNamespace is the root UTS namespace.
    RootUTSNamespace *UTSNamespace

    // RootIPCNamespace is the root IPC namespace.
    RootIPCNamespace *IPCNamespace
}

// Init initializes the Kernel with no tasks.
//
// Callers must manually set Kernel.Platform before calling Init.
func (k *Kernel) Init(args InitKernelArgs) error {
    if args.FeatureSet == nil {
        return fmt.Errorf("FeatureSet is nil")
    }
    if args.Timekeeper == nil {
        return fmt.Errorf("Timekeeper is nil")
    }
    if args.RootUserNamespace == nil {
        return fmt.Errorf("RootUserNamespace is nil")
    }
    if args.ApplicationCores == 0 {
        return fmt.Errorf("ApplicationCores is 0")
    }

    k.featureSet = args.FeatureSet
    k.timekeeper = args.Timekeeper
    k.tasks = newTaskSet()
    k.rootUserNamespace = args.RootUserNamespace
    k.rootUTSNamespace = args.RootUTSNamespace
    k.rootIPCNamespace = args.RootIPCNamespace
    k.networkStack = args.NetworkStack
    k.applicationCores = args.ApplicationCores
    if args.UseHostCores {
        k.useHostCores = true
        maxCPU, err := hostcpu.MaxPossibleCPU()
        if err != nil {
            return fmt.Errorf("Failed to get maximum CPU number: %v", err)
        }
        minAppCores := uint(maxCPU) + 1
        if k.applicationCores < minAppCores {
            log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
            k.applicationCores = minAppCores
        }
    }
    k.extraAuxv = args.ExtraAuxv
    k.vdso = args.Vdso
    k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime}
    k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
    k.netlinkPorts = port.New()

    return nil
}
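// Illustrative sketch (not part of the checked-in file): how a caller might
// bring up a Kernel with Init. The tk, uts, ipc, and mounts values are
// assumed to be constructed elsewhere in the embedding runtime.
//
//     k := &Kernel{}
//     k.Platform = p // must be set before Init
//     if err := k.Init(InitKernelArgs{
//         FeatureSet:        cpuid.HostFeatureSet(),
//         Timekeeper:        tk,
//         RootUserNamespace: auth.NewRootUserNamespace(),
//         ApplicationCores:  4,
//         RootUTSNamespace:  uts,
//         RootIPCNamespace:  ipc,
//     }); err != nil {
//         return err
//     }
//     k.SetRootMountNamespace(mounts) // required before CreateProcess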
// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
func (k *Kernel) SaveTo(w io.Writer) error {
    saveStart := time.Now()
    ctx := k.SupervisorContext()

    // Do not allow other Kernel methods to affect it while it's being saved.
    k.extMu.Lock()
    defer k.extMu.Unlock()

    // Stop time.
    k.pauseTimeLocked()
    defer k.resumeTimeLocked()

    // Flush write operations on open files so data reaches backing storage.
    if err := k.tasks.flushWritesToFiles(ctx); err != nil {
        return err
    }

    // Remove all epoll waiter objects from underlying wait queues.
    // NOTE: for programs to resume execution in future snapshot scenarios,
    // we will need to re-establish these waiter objects after saving.
    k.tasks.unregisterEpollWaiters()

    // Clear the dirent cache before saving because Dirents must be Loaded in
    // a particular order (parents before children), and Loading dirents from
    // a cache breaks that order.
    k.mounts.FlushMountSourceRefs()

    // Ensure that all pending asynchronous work is complete:
    //   - inode and mount release
    //   - asynchronous I/O
    fs.AsyncBarrier()

    // Once all fs work has completed (flushed references have all been
    // released), reset mount mappings. This allows individual mounts to save
    // how inodes map to filesystem resources. Without this, fs.Inodes cannot
    // be restored.
    fs.SaveInodeMappings()

    // Discard unsavable mappings, such as those for host file descriptors.
    // This must be done after waiting for "asynchronous fs work", which
    // includes async I/O that may touch application memory.
    if err := k.invalidateUnsavableMappings(ctx); err != nil {
        return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
    }

    // Save the kernel state.
    kernelStart := time.Now()
    var stats state.Stats
    if err := state.Save(w, k, &stats); err != nil {
        return err
    }
    log.Infof("Kernel save stats: %s", &stats)
    log.Infof("Kernel save took [%s].", time.Since(kernelStart))

    // Save the memory state.
    //
    // FIXME: In the future, this should not be dispatched via an abstract
    // memory type. This should be dispatched to a single memory
    // implementation that belongs to the kernel. (There is currently a single
    // implementation anyway; it just needs to be "unabstracted" and
    // reparented appropriately.)
    memoryStart := time.Now()
    if err := k.Platform.Memory().SaveTo(w); err != nil {
        return err
    }
    log.Infof("Memory save took [%s].", time.Since(memoryStart))

    log.Infof("Overall save took [%s].", time.Since(saveStart))

    return nil
}

func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
    ts.mu.RLock()
    defer ts.mu.RUnlock()
    for t := range ts.Root.tids {
        if fdmap := t.FDMap(); fdmap != nil {
            for _, desc := range fdmap.files {
                if flags := desc.file.Flags(); !flags.Write {
                    continue
                }
                if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
                    continue
                }
                // Here we need all metadata synced.
                syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
                if err := fs.SaveFileFsyncError(syncErr); err != nil {
                    name, _ := desc.file.Dirent.FullName(nil /* root */)
                    return fmt.Errorf("%q was not sufficiently synced: %v", name, err)
                }
            }
        }
    }
    return nil
}

// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
    invalidated := make(map[*mm.MemoryManager]struct{})
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()
    for t := range k.tasks.Root.tids {
        // We can skip locking Task.mu here since the kernel is paused.
        if mm := t.tc.MemoryManager; mm != nil {
            if _, ok := invalidated[mm]; !ok {
                if err := mm.InvalidateUnsavable(ctx); err != nil {
                    return err
                }
                invalidated[mm] = struct{}{}
            }
        }
        // I really wish we just had a sync.Map of all MMs...
        if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
            if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
                return err
            }
        }
    }
    return nil
}

func (ts *TaskSet) unregisterEpollWaiters() {
    ts.mu.RLock()
    defer ts.mu.RUnlock()
    for t := range ts.Root.tids {
        if fdmap := t.FDMap(); fdmap != nil {
            for _, desc := range fdmap.files {
                if desc.file != nil {
                    if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok {
                        e.UnregisterEpollWaiters()
                    }
                }
            }
        }
    }
}
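// Illustrative sketch (not part of the checked-in file): the checkpoint and
// restore sequence implied by the SaveTo/LoadFrom preconditions. w, r, p,
// and net are assumed to be supplied by the embedding runtime.
//
//     // Checkpoint: stop all tasks, then serialize.
//     k.Pause()
//     err := k.SaveTo(w)
//     k.Unpause()
//     if err != nil {
//         return err
//     }
//
//     // Restore: load into a fresh Kernel, then resume execution.
//     k2 := &Kernel{}
//     if err := k2.LoadFrom(r, p, net); err != nil {
//         return err
//     }
//     if err := k2.Start(); err != nil {
//         return err
//     }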
// LoadFrom loads the state of k from r, using the given platform and network
// stack.
func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) error {
    loadStart := time.Now()
    if p == nil {
        return fmt.Errorf("Platform is nil")
    }

    k.Platform = p
    k.networkStack = net

    initAppCores := k.applicationCores

    // Load the kernel state.
    kernelStart := time.Now()
    var stats state.Stats
    if err := state.Load(r, k, &stats); err != nil {
        return err
    }
    log.Infof("Kernel load stats: %s", &stats)
    log.Infof("Kernel load took [%s].", time.Since(kernelStart))

    // Load the memory state.
    //
    // See the note in SaveTo.
    memoryStart := time.Now()
    if err := k.Platform.Memory().LoadFrom(r); err != nil {
        return err
    }
    log.Infof("Memory load took [%s].", time.Since(memoryStart))

    // Ensure that all pending asynchronous work is complete:
    //   - named pipe opening
    //   - inode file opening
    fs.AsyncBarrier()

    log.Infof("Overall load took [%s]", time.Since(loadStart))

    // Applications may size per-cpu structures based on k.applicationCores,
    // so it can't change across save/restore. When we are virtualizing CPU
    // numbers, this isn't a problem. However, when we are exposing host CPU
    // assignments, we can't tolerate an increase in the number of host CPUs,
    // which could result in getcpu(2) returning CPUs that applications expect
    // not to exist.
    if k.useHostCores && initAppCores > k.applicationCores {
        return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
    }

    return nil
}

// Destroy releases resources owned by k.
//
// Preconditions: There must be no task goroutines running in k.
func (k *Kernel) Destroy() {
    if k.mounts != nil {
        k.mounts.DecRef()
        k.mounts = nil
    }
}

// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
    id := atomic.AddUint64(&k.uniqueID, 1)
    if id == 0 {
        panic("unique identifier generator wrapped around")
    }
    return id
}

// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
    // Filename is the filename to load.
    //
    // If this is provided as "", then the file will be guessed via Argv[0].
    Filename string

    // Argv is a list of arguments.
    Argv []string

    // Envv is a list of environment variables.
    Envv []string

    // WorkingDirectory is the initial working directory.
    //
    // This defaults to the root if empty.
    WorkingDirectory string

    // Credentials is the initial credentials.
    Credentials *auth.Credentials

    // FDMap is the initial set of file descriptors. If CreateProcess
    // succeeds, it takes a reference on FDMap.
    FDMap *FDMap

    // Umask is the initial umask.
    Umask uint

    // Limits is the initial resource limits.
    Limits *limits.LimitSet

    // MaxSymlinkTraversals is the maximum number of symlinks to follow
    // during resolution.
    MaxSymlinkTraversals uint

    // UTSNamespace is the initial UTS namespace.
    UTSNamespace *UTSNamespace

    // IPCNamespace is the initial IPC namespace.
    IPCNamespace *IPCNamespace
}

// NewContext returns a context.Context that represents the task that will be
// created by k.CreateProcess with these arguments.
func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext {
    return &createProcessContext{
        Logger: log.Log(),
        k:      k,
        args:   args,
    }
}
// createProcessContext is a context.Context that represents the context
// associated with a task that is being created.
type createProcessContext struct {
    context.NoopSleeper
    log.Logger
    k    *Kernel
    args *CreateProcessArgs
}

// Value implements context.Context.Value.
func (ctx *createProcessContext) Value(key interface{}) interface{} {
    switch key {
    case CtxKernel:
        return ctx.k
    case CtxPIDNamespace:
        // "The new task ... is in the root PID namespace." -
        // Kernel.CreateProcess
        return ctx.k.tasks.Root
    case CtxUTSNamespace:
        return ctx.args.UTSNamespace
    case CtxIPCNamespace:
        return ctx.args.IPCNamespace
    case auth.CtxCredentials:
        return ctx.args.Credentials
    case fs.CtxRoot:
        if ctx.k.mounts == nil {
            return nil
        }
        return ctx.k.mounts.Root()
    case ktime.CtxRealtimeClock:
        return ctx.k.RealtimeClock()
    case limits.CtxLimits:
        return ctx.args.Limits
    case platform.CtxPlatform:
        return ctx.k
    case uniqueid.CtxGlobalUniqueID:
        return ctx.k.UniqueID()
    case uniqueid.CtxInotifyCookie:
        return ctx.k.GenerateInotifyCookie()
    default:
        return nil
    }
}

// CreateProcess creates a new task in a new thread group with the given
// options. The new task has no parent and is in the root PID namespace.
//
// If k.Start() has already been called, the created task will begin running
// immediately. Otherwise, it will be started when k.Start() is called.
//
// CreateProcess has no analogue in Linux; it is used to create the initial
// application task, as well as processes started by the control server.
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    log.Infof("EXEC: %v", args.Argv)

    if k.mounts == nil {
        return nil, fmt.Errorf("no kernel MountNamespace")
    }

    tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
    ctx := args.NewContext(k)

    // Grab the root directory.
    root := fs.RootFromContext(ctx)
    defer root.DecRef()

    // Grab the working directory.
    wd := root // Default.
    if args.WorkingDirectory != "" {
        var err error
        wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals)
        if err != nil {
            return nil, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
        }
        defer wd.DecRef()
    }

    if args.Filename == "" {
        // Was anything provided?
        if len(args.Argv) == 0 {
            return nil, fmt.Errorf("no filename or command provided")
        }
        if !filepath.IsAbs(args.Argv[0]) {
            return nil, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
        }
        args.Filename = args.Argv[0]
    }

    // Create a fresh task context.
    tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet)
    if err != nil {
        return nil, err
    }
    tr := newTaskResources(args.FDMap, newFSContext(root, wd, args.Umask))
    // NewTask unconditionally takes ownership of tr, so we never have to call
    // tr.release.

    // Create the task.
    config := &TaskConfig{
        Kernel:         k,
        ThreadGroup:    tg,
        TaskContext:    tc,
        TaskResources:  tr,
        Credentials:    args.Credentials,
        UTSNamespace:   args.UTSNamespace,
        IPCNamespace:   args.IPCNamespace,
        AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores),
    }
    t, err := k.tasks.NewTask(config)
    if err != nil {
        return nil, err
    }

    // Success.
    if k.started {
        tid := k.tasks.Root.IDOfTask(t)
        t.Start(tid)
    } else if k.globalInit == nil {
        k.globalInit = tg
    }
    return tg, nil
}
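// Illustrative sketch (not part of the checked-in file): how the initial
// application process might be created and started. The creds and fdMap
// values are placeholders for whatever the embedding runtime constructs.
//
//     tg, err := k.CreateProcess(CreateProcessArgs{
//         Argv:                 []string{"/bin/true"},
//         Envv:                 []string{"PATH=/bin"},
//         Credentials:          creds, // assumed, e.g. root credentials
//         FDMap:                fdMap, // assumed initial stdio FD map
//         Limits:               limits.NewLimitSet(),
//         MaxSymlinkTraversals: 10,
//         UTSNamespace:         k.RootUTSNamespace(),
//         IPCNamespace:         k.RootIPCNamespace(),
//     })
//     if err != nil {
//         return err
//     }
//     _ = tg
//     if err := k.Start(); err != nil { // begins running every created task
//         return err
//     }
//     k.WaitExited() // block until all tasks have exited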
// Start starts execution of all tasks in k.
//
// Preconditions: Start may be called exactly once.
func (k *Kernel) Start() error {
    k.extMu.Lock()
    defer k.extMu.Unlock()

    if k.globalInit == nil {
        return fmt.Errorf("kernel contains no tasks")
    }
    if k.started {
        return fmt.Errorf("kernel already started")
    }

    k.started = true
    k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, kernelCPUClockListener{k})
    k.cpuClockTicker.Swap(ktime.Setting{
        Enabled: true,
        Period:  linux.ClockTick,
    })
    // If k was restored by LoadFrom, timers were stopped during Kernel.SaveTo
    // and need to be resumed. If k was initialized by Init, this is a no-op.
    k.resumeTimeLocked()
    // Start task goroutines.
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()
    for t, tid := range k.tasks.Root.tids {
        t.Start(tid)
    }
    return nil
}

// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
// Preconditions: Any task goroutines running in k must be stopped. k.extMu
// must be locked.
func (k *Kernel) pauseTimeLocked() {
    // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
    // Kernel.Start().
    if k.cpuClockTicker != nil {
        k.cpuClockTicker.Pause()
    }

    // By precondition, nothing else can be interacting with PIDNamespace.tids
    // or FDMap.files, so we can iterate them without synchronization. (We
    // can't hold the TaskSet mutex when pausing thread group timers because
    // thread group timers call ThreadGroup.SendSignal, which takes the
    // TaskSet mutex, while holding the Timer mutex.)
    for t := range k.tasks.Root.tids {
        if t == t.tg.leader {
            t.tg.tm.pause()
        }
        // This means we'll iterate FDMaps shared by multiple tasks
        // repeatedly, but ktime.Timer.Pause is idempotent so this is
        // harmless.
        if fdm := t.tr.FDMap; fdm != nil {
            for _, desc := range fdm.files {
                if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
                    tfd.PauseTimer()
                }
            }
        }
    }
    k.timekeeper.PauseUpdates()
}

// resumeTimeLocked resumes all Timers and Timekeeper updates. If
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
// Preconditions: Any task goroutines running in k must be stopped. k.extMu
// must be locked.
func (k *Kernel) resumeTimeLocked() {
    if k.cpuClockTicker != nil {
        k.cpuClockTicker.Resume()
    }

    k.timekeeper.ResumeUpdates()
    for t := range k.tasks.Root.tids {
        if t == t.tg.leader {
            t.tg.tm.resume()
        }
        if fdm := t.tr.FDMap; fdm != nil {
            for _, desc := range fdm.files {
                if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
                    tfd.ResumeTimer()
                }
            }
        }
    }
}

// WaitExited blocks until all tasks in k have exited.
func (k *Kernel) WaitExited() {
    k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status es. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(es ExitStatus) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.Kill(es)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks in k have stopped. Multiple calls to Pause nest and require
// an equal number of calls to Unpause to resume execution.
func (k *Kernel) Pause() {
    k.extMu.Lock()
    k.tasks.BeginExternalStop()
    k.extMu.Unlock()
    k.tasks.runningGoroutines.Wait()
}
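// Illustrative sketch (not part of the checked-in file): the nesting contract
// documented on Pause. Every Pause must be balanced by exactly one Unpause
// before tasks resume.
//
//     k.Pause()
//     k.Pause()   // nested; tasks remain stopped
//     k.Unpause() // still paused: one Pause is outstanding
//     k.Unpause() // tasks resume here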
// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.EndExternalStop()
}

// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Returns false if the signal could not be sent because the Kernel is not
// fully initialized yet.
func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) bool {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    return k.sendExternalSignal(info, context)
}

// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
    return k.featureSet
}

// Timekeeper returns the Timekeeper.
func (k *Kernel) Timekeeper() *Timekeeper {
    return k.timekeeper
}

// TaskSet returns the TaskSet.
func (k *Kernel) TaskSet() *TaskSet {
    return k.tasks
}

// RootUserNamespace returns the root UserNamespace.
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
    return k.rootUserNamespace
}

// RootUTSNamespace returns the root UTSNamespace.
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
    return k.rootUTSNamespace
}

// RootIPCNamespace returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
    return k.rootIPCNamespace
}

// RootMountNamespace returns the MountNamespace.
func (k *Kernel) RootMountNamespace() *fs.MountNamespace {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    return k.mounts
}

// SetRootMountNamespace sets the MountNamespace.
func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.mounts = mounts
}

// NetworkStack returns the network stack. NetworkStack may return nil if no
// network stack is available.
func (k *Kernel) NetworkStack() inet.Stack {
    return k.networkStack
}

// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
// nil if no such thread group exists. GlobalInit may return a thread group
// containing no tasks if the thread group has already exited.
func (k *Kernel) GlobalInit() *ThreadGroup {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    return k.globalInit
}

// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
    return k.applicationCores
}

// RealtimeClock returns the application CLOCK_REALTIME clock.
func (k *Kernel) RealtimeClock() ktime.Clock {
    return k.realtimeClock
}

// MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (k *Kernel) MonotonicClock() ktime.Clock {
    return k.monotonicClock
}

// CPUClockNow returns the current value of k.cpuClock.
func (k *Kernel) CPUClockNow() uint64 {
    return atomic.LoadUint64(&k.cpuClock)
}

// Syslog returns the syslog.
func (k *Kernel) Syslog() *syslog {
    return &k.syslog
}

// GenerateInotifyCookie generates a unique inotify event cookie.
//
// Returned values may overlap with previously returned values if the value
// space is exhausted. 0 is not a valid cookie value; all other values
// representable in a uint32 are allowed.
func (k *Kernel) GenerateInotifyCookie() uint32 {
    id := atomic.AddUint32(&k.nextInotifyCookie, 1)
    // Wrap-around is explicitly allowed for inotify event cookies.
    if id == 0 {
        id = atomic.AddUint32(&k.nextInotifyCookie, 1)
    }
    return id
}
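// Illustrative contrast (not part of the checked-in file) between the two
// counter policies in this file: UniqueID treats 64-bit wrap-around as fatal,
// while inotify cookies skip the reserved value 0 and keep going.
//
//     c := k.GenerateInotifyCookie() // never 0, even after 2^32 calls
//     u := k.UniqueID()              // panics if the uint64 counter wraps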
// NetlinkPorts returns the netlink port manager.
func (k *Kernel) NetlinkPorts() *port.Manager {
    return k.netlinkPorts
}

// ExitError returns the sandbox error that caused the kernel to exit.
func (k *Kernel) ExitError() error {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    return k.exitErr
}

// SetExitError sets the sandbox error that caused the kernel to exit, if one
// is not already set.
func (k *Kernel) SetExitError(err error) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    if k.exitErr == nil {
        k.exitErr = err
    }
}

// SupervisorContext returns a Context with maximum privileges in k. It should
// only be used by goroutines outside the control of the emulated kernel
// defined by k.
//
// Callers are responsible for ensuring that the returned Context is not used
// concurrently with changes to the Kernel.
func (k *Kernel) SupervisorContext() context.Context {
    return supervisorContext{
        Logger: log.Log(),
        k:      k,
    }
}

type supervisorContext struct {
    context.NoopSleeper
    log.Logger
    k *Kernel
}

// Value implements context.Context.
func (ctx supervisorContext) Value(key interface{}) interface{} {
    switch key {
    case CtxCanTrace:
        // The supervisor context can trace anything. (None of
        // supervisorContext's users are expected to invoke ptrace, but ptrace
        // permissions are required for certain file accesses.)
        return func(*Task, bool) bool { return true }
    case CtxKernel:
        return ctx.k
    case CtxPIDNamespace:
        return ctx.k.tasks.Root
    case CtxUTSNamespace:
        return ctx.k.rootUTSNamespace
    case CtxIPCNamespace:
        return ctx.k.rootIPCNamespace
    case auth.CtxCredentials:
        // The supervisor context is global root.
        return auth.NewRootCredentials(ctx.k.rootUserNamespace)
    case fs.CtxRoot:
        return ctx.k.mounts.Root()
    case ktime.CtxRealtimeClock:
        return ctx.k.RealtimeClock()
    case limits.CtxLimits:
        // No limits apply.
        return limits.NewLimitSet()
    case platform.CtxPlatform:
        return ctx.k
    case uniqueid.CtxGlobalUniqueID:
        return ctx.k.UniqueID()
    case uniqueid.CtxInotifyCookie:
        return ctx.k.GenerateInotifyCookie()
    default:
        return nil
    }
}

type kernelCPUClockListener struct {
    k *Kernel
}

// Notify implements ktime.TimerListener.Notify.
func (l kernelCPUClockListener) Notify(exp uint64) {
    atomic.AddUint64(&l.k.cpuClock, exp)
}

// Destroy implements ktime.TimerListener.Destroy.
func (l kernelCPUClockListener) Destroy() {
}
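// Illustrative sketch (not part of the checked-in file): typical
// SupervisorContext use, i.e. privileged filesystem work done from outside
// any task, such as by a control server. The path is a placeholder.
//
//     ctx := k.SupervisorContext()
//     root := fs.RootFromContext(ctx) // root dirent with root credentials
//     defer root.DecRef()
//     d, err := k.RootMountNamespace().FindInode(ctx, root, nil, "/etc/hostname", 10)
//     if err == nil {
//         defer d.DecRef()
//     }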