Diffstat (limited to 'pkg/sentry/kernel/kernel.go')
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 1682
1 files changed, 1682 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go new file mode 100644 index 000000000..2177b785a --- /dev/null +++ b/pkg/sentry/kernel/kernel.go @@ -0,0 +1,1682 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernel provides an emulation of the Linux kernel. +// +// See README.md for a detailed overview. +// +// Lock order (outermost locks must be taken first): +// +// Kernel.extMu +// ThreadGroup.timerMu +// ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer) +// TaskSet.mu +// SignalHandlers.mu +// Task.mu +// runningTasksMu +// +// Locking SignalHandlers.mu in multiple SignalHandlers requires locking +// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same +// time requires locking all of their signal mutexes first. +package kernel + +import ( + "errors" + "fmt" + "path/filepath" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + oldtimerfd "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" + sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/wire" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow +// easy access everywhere. To be removed once VFS2 becomes the default. +var VFS2Enabled = false + +// Kernel represents an emulated Linux kernel. It must be initialized by calling +// Init() or LoadFrom(). +// +// +stateify savable +type Kernel struct { + // extMu serializes external changes to the Kernel with calls to + // Kernel.SaveTo. 
(Kernel.SaveTo requires that the state of the Kernel + // remains frozen for the duration of the call; it requires that the Kernel + // is paused as a precondition, which ensures that none of the tasks + // running within the Kernel can affect its state, but extMu is required to + // ensure that concurrent users of the Kernel *outside* the Kernel's + // control cannot affect its state by calling e.g. + // Kernel.SendExternalSignal.) + extMu sync.Mutex `state:"nosave"` + + // started is true if Start has been called. Unless otherwise specified, + // all Kernel fields become immutable once started becomes true. + started bool `state:"nosave"` + + // All of the following fields are immutable unless otherwise specified. + + // Platform is the platform that is used to execute tasks in the created + // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is + // embedded anonymously (the same issue applies). + platform.Platform `state:"nosave"` + + // mf provides application memory. + mf *pgalloc.MemoryFile `state:"nosave"` + + // See InitKernelArgs for the meaning of these fields. + featureSet *cpuid.FeatureSet + timekeeper *Timekeeper + tasks *TaskSet + rootUserNamespace *auth.UserNamespace + rootNetworkNamespace *inet.Namespace + applicationCores uint + useHostCores bool + extraAuxv []arch.AuxEntry + vdso *loader.VDSO + rootUTSNamespace *UTSNamespace + rootIPCNamespace *IPCNamespace + rootAbstractSocketNamespace *AbstractSocketNamespace + + // futexes is the "root" futex.Manager, from which all others are forked. + // This is necessary to ensure that shared futexes are coherent across all + // tasks, including those created by CreateProcess. + futexes *futex.Manager + + // globalInit is the thread group whose leader has ID 1 in the root PID + // namespace. globalInit is stored separately so that it is accessible even + // after all tasks in the thread group have exited, such that ID 1 is no + // longer mapped. + // + // globalInit is mutable until it is assigned by the first successful call + // to CreateProcess, and is protected by extMu. + globalInit *ThreadGroup + + // realtimeClock is a ktime.Clock based on timekeeper's Realtime. + realtimeClock *timekeeperClock + + // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. + monotonicClock *timekeeperClock + + // syslog is the kernel log. + syslog syslog + + // runningTasksMu synchronizes disable/enable of cpuClockTicker when + // the kernel is idle (runningTasks == 0). + // + // runningTasksMu is used to exclude critical sections when the timer + // disables itself and when the first active task enables the timer, + // ensuring that tasks always see a valid cpuClock value. + runningTasksMu sync.Mutex `state:"nosave"` + + // runningTasks is the total count of tasks currently in + // TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are + // not blocked or stopped. + // + // runningTasks must be accessed atomically. Increments from 0 to 1 are + // further protected by runningTasksMu (see incRunningTasks). + runningTasks int64 + + // cpuClock is incremented every linux.ClockTick. cpuClock is used to + // measure task CPU usage, since sampling monotonicClock twice on every + // syscall turns out to be unreasonably expensive. 
This is similar to how + // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING), + // although Linux also uses scheduler timing information to improve + // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do + // since "preeemptive" scheduling is managed by the Go runtime, which + // doesn't provide this information. + // + // cpuClock is mutable, and is accessed using atomic memory operations. + cpuClock uint64 + + // cpuClockTicker increments cpuClock. + cpuClockTicker *ktime.Timer `state:"nosave"` + + // cpuClockTickerDisabled indicates that cpuClockTicker has been + // disabled because no tasks are running. + // + // cpuClockTickerDisabled is protected by runningTasksMu. + cpuClockTickerDisabled bool + + // cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the + // point it was disabled. It is cached here to avoid a lock ordering + // violation with cpuClockTicker.mu when runningTaskMu is held. + // + // cpuClockTickerSetting is only valid when cpuClockTickerDisabled is + // true. + // + // cpuClockTickerSetting is protected by runningTasksMu. + cpuClockTickerSetting ktime.Setting + + // uniqueID is used to generate unique identifiers. + // + // uniqueID is mutable, and is accessed using atomic memory operations. + uniqueID uint64 + + // nextInotifyCookie is a monotonically increasing counter used for + // generating unique inotify event cookies. + // + // nextInotifyCookie is mutable, and is accessed using atomic memory + // operations. + nextInotifyCookie uint32 + + // netlinkPorts manages allocation of netlink socket port IDs. + netlinkPorts *port.Manager + + // saveErr is the error causing the sandbox to exit during save, if + // any. It is protected by extMu. + saveErr error `state:"nosave"` + + // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. + danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` + + // sockets is the list of all network sockets the system. Protected by + // extMu. + sockets socketList + + // nextSocketEntry is the next entry number to use in sockets. Protected + // by extMu. + nextSocketEntry uint64 + + // deviceRegistry is used to save/restore device.SimpleDevices. + deviceRegistry struct{} `state:".(*device.Registry)"` + + // DirentCacheLimiter controls the number of total dirent entries can be in + // caches. Not all caches use it, only the caches that use host resources use + // the limiter. It may be nil if disabled. + DirentCacheLimiter *fs.DirentCacheLimiter + + // unimplementedSyscallEmitterOnce is used in the initialization of + // unimplementedSyscallEmitter. + unimplementedSyscallEmitterOnce sync.Once `state:"nosave"` + + // unimplementedSyscallEmitter is used to emit unimplemented syscall + // events. This is initialized lazily on the first unimplemented + // syscall. + unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` + + // SpecialOpts contains special kernel options. + SpecialOpts + + // VFS keeps the filesystem state used across the kernel. + vfs vfs.VirtualFilesystem + + // hostMount is the Mount used for file descriptors that were imported + // from the host. + hostMount *vfs.Mount + + // pipeMount is the Mount used for pipes created by the pipe() and pipe2() + // syscalls (as opposed to named pipes created by mknod()). + pipeMount *vfs.Mount + + // shmMount is the Mount used for anonymous files created by the + // memfd_create() syscalls. It is analagous to Linux's shm_mnt. 
+ shmMount *vfs.Mount + + // socketMount is the Mount used for sockets created by the socket() and + // socketpair() syscalls. There are several cases where a socket dentry will + // not be contained in socketMount: + // 1. Socket files created by mknod() + // 2. Socket fds imported from the host (Kernel.hostMount is used for these) + // 3. Socket files created by binding Unix sockets to a file path + socketMount *vfs.Mount + + // If set to true, report address space activation waits as if the task is in + // external wait so that the watchdog doesn't report the task stuck. + SleepForAddressSpaceActivation bool +} + +// InitKernelArgs holds arguments to Init. +type InitKernelArgs struct { + // FeatureSet is the emulated CPU feature set. + FeatureSet *cpuid.FeatureSet + + // Timekeeper manages time for all tasks in the system. + Timekeeper *Timekeeper + + // RootUserNamespace is the root user namespace. + RootUserNamespace *auth.UserNamespace + + // RootNetworkNamespace is the root network namespace. If nil, no networking + // will be available. + RootNetworkNamespace *inet.Namespace + + // ApplicationCores is the number of logical CPUs visible to sandboxed + // applications. The set of logical CPU IDs is [0, ApplicationCores); thus + // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the + // most significant bit in cpu_possible_mask + 1. + ApplicationCores uint + + // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU + // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a + // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it + // will be overridden. + UseHostCores bool + + // ExtraAuxv contains additional auxiliary vector entries that are added to + // each process by the ELF loader. + ExtraAuxv []arch.AuxEntry + + // Vdso holds the VDSO and its parameter page. + Vdso *loader.VDSO + + // RootUTSNamespace is the root UTS namespace. + RootUTSNamespace *UTSNamespace + + // RootIPCNamespace is the root IPC namespace. + RootIPCNamespace *IPCNamespace + + // RootAbstractSocketNamespace is the root Abstract Socket namespace. + RootAbstractSocketNamespace *AbstractSocketNamespace + + // PIDNamespace is the root PID namespace. + PIDNamespace *PIDNamespace +} + +// Init initialize the Kernel with no tasks. +// +// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile +// before calling Init. 
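// NOTE(editor): illustrative sketch, not part of the original change. It
// shows the setup order required by the Init comment above: the embedded
// Platform and the MemoryFile must be installed before Init is called.
// newKernel and its parameters are hypothetical; obtaining a Platform and a
// MemoryFile is outside the scope of this file.
func newKernel(plat platform.Platform, mf *pgalloc.MemoryFile, args InitKernelArgs) (*Kernel, error) {
	k := &Kernel{Platform: plat} // Kernel.Platform is set manually.
	k.SetMemoryFile(mf)          // Must be called before Init (or LoadFrom).
	if err := k.Init(args); err != nil {
		return nil, fmt.Errorf("initializing kernel: %v", err)
	}
	return k, nil
}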
+func (k *Kernel) Init(args InitKernelArgs) error { + if args.FeatureSet == nil { + return fmt.Errorf("FeatureSet is nil") + } + if args.Timekeeper == nil { + return fmt.Errorf("Timekeeper is nil") + } + if args.Timekeeper.clocks == nil { + return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + } + if args.RootUserNamespace == nil { + return fmt.Errorf("RootUserNamespace is nil") + } + if args.ApplicationCores == 0 { + return fmt.Errorf("ApplicationCores is 0") + } + + k.featureSet = args.FeatureSet + k.timekeeper = args.Timekeeper + k.tasks = newTaskSet(args.PIDNamespace) + k.rootUserNamespace = args.RootUserNamespace + k.rootUTSNamespace = args.RootUTSNamespace + k.rootIPCNamespace = args.RootIPCNamespace + k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace + k.rootNetworkNamespace = args.RootNetworkNamespace + if k.rootNetworkNamespace == nil { + k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil) + } + k.applicationCores = args.ApplicationCores + if args.UseHostCores { + k.useHostCores = true + maxCPU, err := hostcpu.MaxPossibleCPU() + if err != nil { + return fmt.Errorf("Failed to get maximum CPU number: %v", err) + } + minAppCores := uint(maxCPU) + 1 + if k.applicationCores < minAppCores { + log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) + k.applicationCores = minAppCores + } + } + k.extraAuxv = args.ExtraAuxv + k.vdso = args.Vdso + k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} + k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} + k.futexes = futex.NewManager() + k.netlinkPorts = port.New() + + if VFS2Enabled { + if err := k.vfs.Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %v", err) + } + + pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create pipefs filesystem: %v", err) + } + defer pipeFilesystem.DecRef() + pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create pipefs mount: %v", err) + } + k.pipeMount = pipeMount + + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + if err != nil { + return fmt.Errorf("failed to create tmpfs filesystem: %v", err) + } + defer tmpfsFilesystem.DecRef() + defer tmpfsRoot.DecRef() + shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create tmpfs mount: %v", err) + } + k.shmMount = shmMount + + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create sockfs filesystem: %v", err) + } + defer socketFilesystem.DecRef() + socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create sockfs mount: %v", err) + } + k.socketMount = socketMount + } + + return nil +} + +// SaveTo saves the state of k to w. +// +// Preconditions: The kernel must be paused throughout the call to SaveTo. +func (k *Kernel) SaveTo(w wire.Writer) error { + saveStart := time.Now() + ctx := k.SupervisorContext() + + // Do not allow other Kernel methods to affect it while it's being saved. + k.extMu.Lock() + defer k.extMu.Unlock() + + // Stop time. + k.pauseTimeLocked() + defer k.resumeTimeLocked() + + // Evict all evictable MemoryFile allocations. 
+ k.mf.StartEvictions() + k.mf.WaitForEvictions() + + // Flush write operations on open files so data reaches backing storage. + // This must come after MemoryFile eviction since eviction may cause file + // writes. + if err := k.tasks.flushWritesToFiles(ctx); err != nil { + return err + } + + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters() + + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + if err := k.flushMountSourceRefs(); err != nil { + return err + } + + // Ensure that all inode and mount release operations have completed. + fs.AsyncBarrier() + + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() + + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Save the CPUID FeatureSet before the rest of the kernel so we can + // verify its compatibility on restore before attempting to restore the + // entire kernel, which may fail on an incompatible machine. + // + // N.B. This will also be saved along with the full kernel save below. + cpuidStart := time.Now() + if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil { + return err + } + log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) + + // Save the kernel state. + kernelStart := time.Now() + stats, err := state.Save(k.SupervisorContext(), w, k) + if err != nil { + return err + } + log.Infof("Kernel save stats: %s", stats.String()) + log.Infof("Kernel save took [%s].", time.Since(kernelStart)) + + // Save the memory file's state. + memoryStart := time.Now() + if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { + return err + } + log.Infof("Memory save took [%s].", time.Since(memoryStart)) + + log.Infof("Overall save took [%s].", time.Since(saveStart)) + + return nil +} + +// flushMountSourceRefs flushes the MountSources for all mounted filesystems +// and open FDs. +func (k *Kernel) flushMountSourceRefs() error { + // Flush all mount sources for currently mounted filesystems in each task. + flushed := make(map[*fs.MountNamespace]struct{}) + k.tasks.mu.RLock() + k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if _, ok := flushed[tg.mounts]; ok { + // Already flushed. + return + } + tg.mounts.FlushMountSourceRefs() + flushed[tg.mounts] = struct{}{} + }) + k.tasks.mu.RUnlock() + + // There may be some open FDs whose filesystems have been unmounted. We + // must flush those as well. + return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + file.Dirent.Inode.MountSource.FlushDirentRefs() + return nil + }) +} + +// forEachFDPaused applies the given function to each open file descriptor in +// each task. +// +// Precondition: Must be called with the kernel paused. 
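// NOTE(editor): illustrative sketch, not part of the original change. It
// shows how a caller satisfies SaveTo's precondition that the kernel stays
// paused for the whole save, and the matching restore path. saveKernel and
// restoreKernel are hypothetical helpers; the writer, reader, network stack
// and clocks are supplied by the caller.
func saveKernel(k *Kernel, w wire.Writer) error {
	k.Pause()         // Stop all tasks and asynchronous I/O.
	defer k.Unpause() // Resume execution even if the save fails.
	return k.SaveTo(w)
}

func restoreKernel(k *Kernel, mf *pgalloc.MemoryFile, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
	k.SetMemoryFile(mf) // Must be set before LoadFrom, as for Init.
	return k.LoadFrom(r, net, clocks)
}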
+func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return nil + } + + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if t.fdTable == nil { + continue + } + t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { + err = lastErr + } + }) + } + return err +} + +func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + if flags := file.Flags(); !flags.Write { + return nil + } + if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + return nil + } + // Here we need all metadata synced. + syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + if err := fs.SaveFileFsyncError(syncErr); err != nil { + name, _ := file.Dirent.FullName(nil /* root */) + // Wrap this error in ErrSaveRejection so that it will trigger a save + // error, rather than a panic. This also allows us to distinguish Fsync + // errors from state file errors in state.Save. + return fs.ErrSaveRejection{ + Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), + } + } + return nil + }) +} + +// Preconditions: The kernel must be paused. +func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { + invalidated := make(map[*mm.MemoryManager]struct{}) + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t := range k.tasks.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if mm := t.tc.MemoryManager; mm != nil { + if _, ok := invalidated[mm]; !ok { + if err := mm.InvalidateUnsavable(ctx); err != nil { + return err + } + invalidated[mm] = struct{}{} + } + } + // I really wish we just had a sync.Map of all MMs... + if r, ok := t.runState.(*runSyscallAfterExecStop); ok { + if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +func (ts *TaskSet) unregisterEpollWaiters() { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return + } + + ts.mu.RLock() + defer ts.mu.RUnlock() + + // Tasks that belong to the same process could potentially point to the + // same FDTable. So we retain a map of processed ones to avoid + // processing the same FDTable multiple times. + processed := make(map[*FDTable]struct{}) + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if t.fdTable == nil { + continue + } + if _, ok := processed[t.fdTable]; ok { + continue + } + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + }) + processed[t.fdTable] = struct{}{} + } +} + +// LoadFrom returns a new Kernel loaded from args. +func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error { + loadStart := time.Now() + + initAppCores := k.applicationCores + + // Load the pre-saved CPUID FeatureSet. + // + // N.B. This was also saved along with the full kernel below, so we + // don't need to explicitly install it in the Kernel. 
+ cpuidStart := time.Now() + var features cpuid.FeatureSet + if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil { + return err + } + log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) + + // Verify that the FeatureSet is usable on this host. We do this before + // Kernel load so that the explicit CPUID mismatch error has priority + // over floating point state restore errors that may occur on load on + // an incompatible machine. + if err := features.CheckHostCompatible(); err != nil { + return err + } + + // Load the kernel state. + kernelStart := time.Now() + stats, err := state.Load(k.SupervisorContext(), r, k) + if err != nil { + return err + } + log.Infof("Kernel load stats: %s", stats.String()) + log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + + // rootNetworkNamespace should be populated after loading the state file. + // Restore the root network stack. + k.rootNetworkNamespace.RestoreRootStack(net) + + // Load the memory file's state. + memoryStart := time.Now() + if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { + return err + } + log.Infof("Memory load took [%s].", time.Since(memoryStart)) + + log.Infof("Overall load took [%s]", time.Since(loadStart)) + + k.Timekeeper().SetClocks(clocks) + if net != nil { + net.Resume() + } + + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + if err := fs.AsyncErrorBarrier(); err != nil { + return err + } + + tcpip.AsyncLoading.Wait() + + log.Infof("Overall load took [%s] after async work", time.Since(loadStart)) + + // Applications may size per-cpu structures based on k.applicationCores, so + // it can't change across save/restore. When we are virtualizing CPU + // numbers, this isn't a problem. However, when we are exposing host CPU + // assignments, we can't tolerate an increase in the number of host CPUs, + // which could result in getcpu(2) returning CPUs that applications expect + // not to exist. + if k.useHostCores && initAppCores > k.applicationCores { + return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) + } + + return nil +} + +// UniqueID returns a unique identifier. +func (k *Kernel) UniqueID() uint64 { + id := atomic.AddUint64(&k.uniqueID, 1) + if id == 0 { + panic("unique identifier generator wrapped around") + } + return id +} + +// CreateProcessArgs holds arguments to kernel.CreateProcess. +type CreateProcessArgs struct { + // Filename is the filename to load as the init binary. + // + // If this is provided as "", File will be checked, then the file will be + // guessed via Argv[0]. + Filename string + + // File is a passed host FD pointing to a file to load as the init binary. + // + // This is checked if and only if Filename is "". + File fsbridge.File + + // Argvv is a list of arguments. + Argv []string + + // Envv is a list of environment variables. + Envv []string + + // WorkingDirectory is the initial working directory. + // + // This defaults to the root if empty. + WorkingDirectory string + + // Credentials is the initial credentials. + Credentials *auth.Credentials + + // FDTable is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDTable. + FDTable *FDTable + + // Umask is the initial umask. + Umask uint + + // Limits is the initial resource limits. + Limits *limits.LimitSet + + // MaxSymlinkTraversals is the maximum number of symlinks to follow + // during resolution. 
+ MaxSymlinkTraversals uint + + // UTSNamespace is the initial UTS namespace. + UTSNamespace *UTSNamespace + + // IPCNamespace is the initial IPC namespace. + IPCNamespace *IPCNamespace + + // PIDNamespace is the initial PID Namespace. + PIDNamespace *PIDNamespace + + // AbstractSocketNamespace is the initial Abstract Socket namespace. + AbstractSocketNamespace *AbstractSocketNamespace + + // MountNamespace optionally contains the mount namespace for this + // process. If nil, the init process's mount namespace is used. + // + // Anyone setting MountNamespace must donate a reference (i.e. + // increment it). + MountNamespace *fs.MountNamespace + + // MountNamespaceVFS2 optionally contains the mount namespace for this + // process. If nil, the init process's mount namespace is used. + // + // Anyone setting MountNamespaceVFS2 must donate a reference (i.e. + // increment it). + MountNamespaceVFS2 *vfs.MountNamespace + + // ContainerID is the container that the process belongs to. + ContainerID string +} + +// NewContext returns a context.Context that represents the task that will be +// created by args.NewContext(k). +func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext { + return &createProcessContext{ + Logger: log.Log(), + k: k, + args: args, + } +} + +// createProcessContext is a context.Context that represents the context +// associated with a task that is being created. +type createProcessContext struct { + context.NoopSleeper + log.Logger + k *Kernel + args *CreateProcessArgs +} + +// Value implements context.Context.Value. +func (ctx *createProcessContext) Value(key interface{}) interface{} { + switch key { + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.args.PIDNamespace + case CtxUTSNamespace: + return ctx.args.UTSNamespace + case CtxIPCNamespace: + return ctx.args.IPCNamespace + case auth.CtxCredentials: + return ctx.args.Credentials + case fs.CtxRoot: + if ctx.args.MountNamespace != nil { + // MountNamespace.Root() will take a reference on the root dirent for us. + return ctx.args.MountNamespace.Root() + } + return nil + case vfs.CtxRoot: + if ctx.args.MountNamespaceVFS2 == nil { + return nil + } + // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. + return ctx.args.MountNamespaceVFS2.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2 takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + return ctx.args.Limits + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k + default: + return nil + } +} + +// CreateProcess creates a new task in a new thread group with the given +// options. The new task has no parent and is in the root PID namespace. +// +// If k.Start() has already been called, then the created process must be +// started by calling kernel.StartProcess(tg). +// +// If k.Start() has not yet been called, then the created task will begin +// running when k.Start() is called. 
+// +// CreateProcess has no analogue in Linux; it is used to create the initial +// application task, as well as processes started by the control server. +func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { + k.extMu.Lock() + defer k.extMu.Unlock() + log.Infof("EXEC: %v", args.Argv) + + ctx := args.NewContext(k) + + var ( + opener fsbridge.Lookup + fsContext *FSContext + mntns *fs.MountNamespace + ) + + if VFS2Enabled { + mntnsVFS2 := args.MountNamespaceVFS2 + if mntnsVFS2 == nil { + // MountNamespaceVFS2 adds a reference to the namespace, which is + // transferred to the new process. + mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2() + } + // Get the root directory from the MountNamespace. + root := args.MountNamespaceVFS2.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + wd := root // Default. + if args.WorkingDirectory != "" { + pop := vfs.PathOperation{ + Root: root, + Start: wd, + Path: fspath.Parse(args.WorkingDirectory), + FollowFinalSymlink: true, + } + var err error + wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) + fsContext = NewFSContextVFS2(root, wd, args.Umask) + + } else { + mntns = args.MountNamespace + if mntns == nil { + mntns = k.GlobalInit().Leader().MountNamespace() + mntns.IncRef() + } + // Get the root directory from the MountNamespace. + root := mntns.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + remainingTraversals := args.MaxSymlinkTraversals + wd := root // Default. + if args.WorkingDirectory != "" { + var err error + wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewFSLookup(mntns, root, wd) + fsContext = newFSContext(root, wd, args.Umask) + } + + tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + + // Check which file to start from. + switch { + case args.Filename != "": + // If a filename is given, take that. + // Set File to nil so we resolve the path in LoadTaskImage. + args.File = nil + case args.File != nil: + // If File is set, take the File provided directly. + default: + // Otherwise look at Argv and see if the first argument is a valid path. + if len(args.Argv) == 0 { + return nil, 0, fmt.Errorf("no filename or command provided") + } + if !filepath.IsAbs(args.Argv[0]) { + return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) + } + args.Filename = args.Argv[0] + } + + // Create a fresh task context. 
+ remainingTraversals := args.MaxSymlinkTraversals + loadArgs := loader.LoadArgs{ + Opener: opener, + RemainingTraversals: &remainingTraversals, + ResolveFinal: true, + Filename: args.Filename, + File: args.File, + CloseOnExec: false, + Argv: args.Argv, + Envv: args.Envv, + Features: k.featureSet, + } + + tc, se := k.LoadTaskImage(ctx, loadArgs) + if se != nil { + return nil, 0, errors.New(se.String()) + } + + // Take a reference on the FDTable, which will be transferred to + // TaskSet.NewTask(). + args.FDTable.IncRef() + + // Create the task. + config := &TaskConfig{ + Kernel: k, + ThreadGroup: tg, + TaskContext: tc, + FSContext: fsContext, + FDTable: args.FDTable, + Credentials: args.Credentials, + NetworkNamespace: k.RootNetworkNamespace(), + AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), + UTSNamespace: args.UTSNamespace, + IPCNamespace: args.IPCNamespace, + AbstractSocketNamespace: args.AbstractSocketNamespace, + MountNamespaceVFS2: args.MountNamespaceVFS2, + ContainerID: args.ContainerID, + } + t, err := k.tasks.NewTask(config) + if err != nil { + return nil, 0, err + } + t.traceExecEvent(tc) // Simulate exec for tracing. + + // Success. + tgid := k.tasks.Root.IDOfThreadGroup(tg) + if k.globalInit == nil { + k.globalInit = tg + } + return tg, tgid, nil +} + +// StartProcess starts running a process that was created with CreateProcess. +func (k *Kernel) StartProcess(tg *ThreadGroup) { + t := tg.Leader() + tid := k.tasks.Root.IDOfTask(t) + t.Start(tid) +} + +// Start starts execution of all tasks in k. +// +// Preconditions: Start may be called exactly once. +func (k *Kernel) Start() error { + k.extMu.Lock() + defer k.extMu.Unlock() + + if k.globalInit == nil { + return fmt.Errorf("kernel contains no tasks") + } + if k.started { + return fmt.Errorf("kernel already started") + } + + k.started = true + k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k)) + k.cpuClockTicker.Swap(ktime.Setting{ + Enabled: true, + Period: linux.ClockTick, + }) + // If k was created by LoadKernelFrom, timers were stopped during + // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, + // this is a no-op. + k.resumeTimeLocked() + // Start task goroutines. + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t, tid := range k.tasks.Root.tids { + t.Start(tid) + } + return nil +} + +// pauseTimeLocked pauses all Timers and Timekeeper updates. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) pauseTimeLocked() { + // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before + // Kernel.Start(). + if k.cpuClockTicker != nil { + k.cpuClockTicker.Pause() + } + + // By precondition, nothing else can be interacting with PIDNamespace.tids + // or FDTable.files, so we can iterate them without synchronization. (We + // can't hold the TaskSet mutex when pausing thread group timers because + // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet + // mutex, while holding the Timer mutex.) + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.itimerRealTimer.Pause() + for _, it := range t.tg.timers { + it.PauseTimer() + } + } + // This means we'll iterate FDTables shared by multiple tasks repeatedly, + // but ktime.Timer.Pause is idempotent so this is harmless. 
+ if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { + tfd.PauseTimer() + } + } else { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { + tfd.PauseTimer() + } + } + }) + } + } + k.timekeeper.PauseUpdates() +} + +// resumeTimeLocked resumes all Timers and Timekeeper updates. If +// pauseTimeLocked has not been previously called, resumeTimeLocked has no +// effect. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) resumeTimeLocked() { + if k.cpuClockTicker != nil { + k.cpuClockTicker.Resume() + } + + k.timekeeper.ResumeUpdates() + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.itimerRealTimer.Resume() + for _, it := range t.tg.timers { + it.ResumeTimer() + } + } + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { + tfd.ResumeTimer() + } + } else { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + } + }) + } + } +} + +func (k *Kernel) incRunningTasks() { + for { + tasks := atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // Standard case. Simply increment. + if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) { + continue + } + return + } + + // Transition from 0 -> 1. Synchronize with other transitions and timer. + k.runningTasksMu.Lock() + tasks = atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // We're no longer the first task, no need to + // re-enable. + atomic.AddInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + if !k.cpuClockTickerDisabled { + // Timer was never disabled. + atomic.StoreInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + // We need to update cpuClock for all of the ticks missed while we + // slept, and then re-enable the timer. + // + // The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify + // always increments cpuClock by 1 regardless of the number of + // expirations as a heuristic to avoid over-accounting in cases of CPU + // throttling. + // + // We want to cover the normal case, when all time should be accounted, + // so we increment for all expirations. Throttling is less concerning + // here because the ticker is only disabled from Notify. This means + // that Notify must schedule and compensate for the throttled period + // before the timer is disabled. Throttling while the timer is disabled + // doesn't matter, as nothing is running or reading cpuClock anyways. + // + // S/R also adds complication, as there are two cases. Recall that + // monotonicClock will jump forward on restore. + // + // 1. If the ticker is enabled during save, then on Restore Notify is + // called with many expirations, covering the time jump, but cpuClock + // is only incremented by 1. + // + // 2. If the ticker is disabled during save, then after Restore the + // first wakeup will call this function and cpuClock will be + // incremented by the number of expirations across the S/R. + // + // These cause very different value of cpuClock. But again, since + // nothing was running while the ticker was disabled, those differences + // don't matter. 
+ setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now()) + if exp > 0 { + atomic.AddUint64(&k.cpuClock, exp) + } + + // Now that cpuClock is updated it is safe to allow other tasks to + // transition to running. + atomic.StoreInt64(&k.runningTasks, 1) + + // N.B. we must unlock before calling Swap to maintain lock ordering. + // + // cpuClockTickerDisabled need not wait until after Swap to become + // true. It is sufficient that the timer *will* be enabled. + k.cpuClockTickerDisabled = false + k.runningTasksMu.Unlock() + + // This won't call Notify (unless it's been ClockTick since setting.At + // above). This means we skip the thread group work in Notify. However, + // since nothing was running while we were disabled, none of the timers + // could have expired. + k.cpuClockTicker.Swap(setting) + + return + } +} + +func (k *Kernel) decRunningTasks() { + tasks := atomic.AddInt64(&k.runningTasks, -1) + if tasks < 0 { + panic(fmt.Sprintf("Invalid running count %d", tasks)) + } + + // Nothing to do. The next CPU clock tick will disable the timer if + // there is still nothing running. This provides approximately one tick + // of slack in which we can switch back and forth between idle and + // active without an expensive transition. +} + +// WaitExited blocks until all tasks in k have exited. +func (k *Kernel) WaitExited() { + k.tasks.liveGoroutines.Wait() +} + +// Kill requests that all tasks in k immediately exit as if group exiting with +// status es. Kill does not wait for tasks to exit. +func (k *Kernel) Kill(es ExitStatus) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.Kill(es) +} + +// Pause requests that all tasks in k temporarily stop executing, and blocks +// until all tasks and asynchronous I/O operations in k have stopped. Multiple +// calls to Pause nest and require an equal number of calls to Unpause to +// resume execution. +func (k *Kernel) Pause() { + k.extMu.Lock() + k.tasks.BeginExternalStop() + k.extMu.Unlock() + k.tasks.runningGoroutines.Wait() + k.tasks.aioGoroutines.Wait() +} + +// Unpause ends the effect of a previous call to Pause. If Unpause is called +// without a matching preceding call to Pause, Unpause may panic. +func (k *Kernel) Unpause() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.EndExternalStop() +} + +// SendExternalSignal injects a signal into the kernel. +// +// context is used only for debugging to describe how the signal was received. +// +// Preconditions: Kernel must have an init process. +func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.sendExternalSignal(info, context) +} + +// SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup. +// This function doesn't skip signals like SendExternalSignal does. +func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error { + k.extMu.Lock() + defer k.extMu.Unlock() + return tg.SendSignal(info) +} + +// SendContainerSignal sends the given signal to all processes inside the +// namespace that match the given container ID. 
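// NOTE(editor): illustrative sketch, not part of the original change. It
// distills the pattern used by incRunningTasks above: ordinary increments use
// a lock-free compare-and-swap, and only the 0 -> 1 wake-up transition takes
// the mutex, so the work done when leaving the idle state (catching up
// cpuClock and re-enabling the ticker) is serialized. Names are placeholders.
type idleGate struct {
	mu      sync.Mutex
	running int64 // Accessed atomically; 0 -> 1 transitions also hold mu.
}

func (g *idleGate) inc(onWake func()) {
	for {
		n := atomic.LoadInt64(&g.running)
		if n != 0 {
			// Fast path: not idle, just increment.
			if atomic.CompareAndSwapInt64(&g.running, n, n+1) {
				return
			}
			continue // Lost a race; retry.
		}

		// Possible 0 -> 1 transition: serialize with other wakers.
		g.mu.Lock()
		if atomic.LoadInt64(&g.running) == 0 {
			onWake() // e.g. advance cpuClock and re-enable the ticker.
		}
		atomic.AddInt64(&g.running, 1)
		g.mu.Unlock()
		return
	}
}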
+func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + var lastErr error + for tg := range k.tasks.Root.tgids { + if tg.leader.ContainerID() == cid { + tg.signalHandlers.mu.Lock() + infoCopy := *info + if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + lastErr = err + } + tg.signalHandlers.mu.Unlock() + } + } + return lastErr +} + +// RebuildTraceContexts rebuilds the trace context for all tasks. +// +// Unfortunately, if these are built while tracing is not enabled, then we will +// not have meaningful trace data. Rebuilding here ensures that we can do so +// after tracing has been enabled. +func (k *Kernel) RebuildTraceContexts() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + for t, tid := range k.tasks.Root.tids { + t.rebuildTraceContext(tid) + } +} + +// FeatureSet returns the FeatureSet. +func (k *Kernel) FeatureSet() *cpuid.FeatureSet { + return k.featureSet +} + +// Timekeeper returns the Timekeeper. +func (k *Kernel) Timekeeper() *Timekeeper { + return k.timekeeper +} + +// TaskSet returns the TaskSet. +func (k *Kernel) TaskSet() *TaskSet { + return k.tasks +} + +// RootUserNamespace returns the root UserNamespace. +func (k *Kernel) RootUserNamespace() *auth.UserNamespace { + return k.rootUserNamespace +} + +// RootUTSNamespace returns the root UTSNamespace. +func (k *Kernel) RootUTSNamespace() *UTSNamespace { + return k.rootUTSNamespace +} + +// RootIPCNamespace returns the root IPCNamespace. +func (k *Kernel) RootIPCNamespace() *IPCNamespace { + return k.rootIPCNamespace +} + +// RootPIDNamespace returns the root PIDNamespace. +func (k *Kernel) RootPIDNamespace() *PIDNamespace { + return k.tasks.Root +} + +// RootAbstractSocketNamespace returns the root AbstractSocketNamespace. +func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { + return k.rootAbstractSocketNamespace +} + +// RootNetworkNamespace returns the root network namespace, always non-nil. +func (k *Kernel) RootNetworkNamespace() *inet.Namespace { + return k.rootNetworkNamespace +} + +// GlobalInit returns the thread group with ID 1 in the root PID namespace, or +// nil if no such thread group exists. GlobalInit may return a thread group +// containing no tasks if the thread group has already exited. +func (k *Kernel) GlobalInit() *ThreadGroup { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.globalInit +} + +// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace. +func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) { + k.globalInit = tg +} + +// ApplicationCores returns the number of CPUs visible to sandboxed +// applications. +func (k *Kernel) ApplicationCores() uint { + return k.applicationCores +} + +// RealtimeClock returns the application CLOCK_REALTIME clock. +func (k *Kernel) RealtimeClock() ktime.Clock { + return k.realtimeClock +} + +// MonotonicClock returns the application CLOCK_MONOTONIC clock. +func (k *Kernel) MonotonicClock() ktime.Clock { + return k.monotonicClock +} + +// CPUClockNow returns the current value of k.cpuClock. +func (k *Kernel) CPUClockNow() uint64 { + return atomic.LoadUint64(&k.cpuClock) +} + +// Syslog returns the syslog. +func (k *Kernel) Syslog() *syslog { + return &k.syslog +} + +// GenerateInotifyCookie generates a unique inotify event cookie. 
+// +// Returned values may overlap with previously returned values if the value +// space is exhausted. 0 is not a valid cookie value, all other values +// representable in a uint32 are allowed. +func (k *Kernel) GenerateInotifyCookie() uint32 { + id := atomic.AddUint32(&k.nextInotifyCookie, 1) + // Wrap-around is explicitly allowed for inotify event cookies. + if id == 0 { + id = atomic.AddUint32(&k.nextInotifyCookie, 1) + } + return id +} + +// NetlinkPorts returns the netlink port manager. +func (k *Kernel) NetlinkPorts() *port.Manager { + return k.netlinkPorts +} + +// SaveError returns the sandbox error that caused the kernel to exit during +// save. +func (k *Kernel) SaveError() error { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.saveErr +} + +// SetSaveError sets the sandbox error that caused the kernel to exit during +// save, if one is not already set. +func (k *Kernel) SetSaveError(err error) { + k.extMu.Lock() + defer k.extMu.Unlock() + if k.saveErr == nil { + k.saveErr = err + } +} + +var _ tcpip.Clock = (*Kernel)(nil) + +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (k *Kernel) NowNanoseconds() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Realtime) + if err != nil { + panic("Kernel.NowNanoseconds: " + err.Error()) + } + return now +} + +// NowMonotonic implements tcpip.Clock.NowMonotonic. +func (k *Kernel) NowMonotonic() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Kernel.NowMonotonic: " + err.Error()) + } + return now +} + +// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or +// LoadFrom. +func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { + k.mf = mf +} + +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. +func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { + return k.mf +} + +// SupervisorContext returns a Context with maximum privileges in k. It should +// only be used by goroutines outside the control of the emulated kernel +// defined by e. +// +// Callers are responsible for ensuring that the returned Context is not used +// concurrently with changes to the Kernel. +func (k *Kernel) SupervisorContext() context.Context { + return supervisorContext{ + Logger: log.Log(), + k: k, + } +} + +// SocketEntry represents a socket recorded in Kernel.sockets. It implements +// refs.WeakRefUser for sockets stored in the socket table. +// +// +stateify savable +type SocketEntry struct { + socketEntry + k *Kernel + Sock *refs.WeakRef + SockVFS2 *vfs.FileDescription + ID uint64 // Socket table entry number. +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (s *SocketEntry) WeakRefGone() { + s.k.extMu.Lock() + s.k.sockets.Remove(s) + s.k.extMu.Unlock() +} + +// RecordSocket adds a socket to the system-wide socket table for tracking. +// +// Precondition: Caller must hold a reference to sock. +func (k *Kernel) RecordSocket(sock *fs.File) { + k.extMu.Lock() + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{k: k, ID: id} + s.Sock = refs.NewWeakRef(sock, s) + k.sockets.PushBack(s) + k.extMu.Unlock() +} + +// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for +// tracking. +// +// Precondition: Caller must hold a reference to sock. +// +// Note that the socket table will not hold a reference on the +// vfs.FileDescription, because we do not support weak refs on VFS2 files. 
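// NOTE(editor): illustrative sketch, not part of the original change. It
// shows the intended use of GenerateInotifyCookie above: a rename produces an
// IN_MOVED_FROM/IN_MOVED_TO event pair sharing one cookie so readers can
// correlate the two events. queueRenameEvents and its queue callback are
// hypothetical.
func queueRenameEvents(k *Kernel, queue func(mask uint32, cookie uint32, name string), oldName, newName string) {
	cookie := k.GenerateInotifyCookie() // Never zero; may wrap around.
	queue(linux.IN_MOVED_FROM, cookie, oldName)
	queue(linux.IN_MOVED_TO, cookie, newName)
}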
+func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { + k.extMu.Lock() + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{ + k: k, + ID: id, + SockVFS2: sock, + } + k.sockets.PushBack(s) + k.extMu.Unlock() +} + +// ListSockets returns a snapshot of all sockets. +// +// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef() +// to get a reference on a socket in the table. +func (k *Kernel) ListSockets() []*SocketEntry { + k.extMu.Lock() + var socks []*SocketEntry + for s := k.sockets.Front(); s != nil; s = s.Next() { + socks = append(socks, s) + } + k.extMu.Unlock() + return socks +} + +// supervisorContext is a privileged context. +type supervisorContext struct { + context.NoopSleeper + log.Logger + k *Kernel +} + +// Value implements context.Context. +func (ctx supervisorContext) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + // The supervisor context can trace anything. (None of + // supervisorContext's users are expected to invoke ptrace, but ptrace + // permissions are required for certain file accesses.) + return func(*Task, bool) bool { return true } + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.k.rootUTSNamespace + case CtxIPCNamespace: + return ctx.k.rootIPCNamespace + case auth.CtxCredentials: + // The supervisor context is global root. + return auth.NewRootCredentials(ctx.k.rootUserNamespace) + case fs.CtxRoot: + if ctx.k.globalInit != nil { + return ctx.k.globalInit.mounts.Root() + } + return nil + case vfs.CtxRoot: + if ctx.k.globalInit == nil { + return vfs.VirtualDentry{} + } + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + defer mntns.DecRef() + // Root() takes a reference on the root dirent for us. + return mntns.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2() takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + // No limits apply. + return limits.NewLimitSet() + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k + default: + return nil + } +} + +// Rate limits for the number of unimplemented syscall events. +const ( + unimplementedSyscallsMaxRate = 100 // events per second + unimplementedSyscallBurst = 1000 // events +) + +// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. +func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { + k.unimplementedSyscallEmitterOnce.Do(func() { + k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst) + }) + + t := TaskFromContext(ctx) + k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} + +// VFS returns the virtual filesystem for the kernel. 
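// NOTE(editor): illustrative sketch, not part of the original change. It
// shows the iteration pattern the ListSockets comment above asks of VFS2
// callers: entries do not own a reference, so each socket must be
// TryIncRef'd before use. visit is a hypothetical callback.
func forEachVFS2Socket(k *Kernel, visit func(*vfs.FileDescription)) {
	for _, se := range k.ListSockets() {
		if se.SockVFS2 == nil || !se.SockVFS2.TryIncRef() {
			continue // Not a VFS2 socket, or it is being destroyed.
		}
		visit(se.SockVFS2)
		se.SockVFS2.DecRef()
	}
}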
+func (k *Kernel) VFS() *vfs.VirtualFilesystem { + return &k.vfs +} + +// SetHostMount sets the hostfs mount. +func (k *Kernel) SetHostMount(mnt *vfs.Mount) { + if k.hostMount != nil { + panic("Kernel.hostMount cannot be set more than once") + } + k.hostMount = mnt +} + +// HostMount returns the hostfs mount. +func (k *Kernel) HostMount() *vfs.Mount { + return k.hostMount +} + +// PipeMount returns the pipefs mount. +func (k *Kernel) PipeMount() *vfs.Mount { + return k.pipeMount +} + +// ShmMount returns the tmpfs mount. +func (k *Kernel) ShmMount() *vfs.Mount { + return k.shmMount +} + +// SocketMount returns the sockfs mount. +func (k *Kernel) SocketMount() *vfs.Mount { + return k.socketMount +} |
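A minimal sketch (not part of the original change) of the process-creation
flow described by the CreateProcess, StartProcess and Start comments above;
boot is a hypothetical helper and only uses calls documented in this file:

func boot(k *Kernel, args CreateProcessArgs) (*ThreadGroup, error) {
	tg, _, err := k.CreateProcess(args)
	if err != nil {
		return nil, err
	}
	// Tasks created before Start begin running when Start is called.
	if err := k.Start(); err != nil {
		return nil, err
	}
	// Processes created after Start must be launched explicitly:
	//   tg2, _, _ := k.CreateProcess(otherArgs)
	//   k.StartProcess(tg2)
	return tg, nil
}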