// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package kernel provides an emulation of the Linux kernel. // // See README.md for a detailed overview. // // Lock order (outermost locks must be taken first): // // Kernel.extMu // ThreadGroup.timerMu // ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer) // TaskSet.mu // SignalHandlers.mu // Task.mu // runningTasksMu // // Locking SignalHandlers.mu in multiple SignalHandlers requires locking // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same // time requires locking all of their signal mutexes first. package kernel import ( "errors" "fmt" "io" "path/filepath" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/unimpl" uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) // VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow // easy access everywhere. To be removed once VFS2 becomes the default. var VFS2Enabled = false // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). // // +stateify savable type Kernel struct { // extMu serializes external changes to the Kernel with calls to // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel // remains frozen for the duration of the call; it requires that the Kernel // is paused as a precondition, which ensures that none of the tasks // running within the Kernel can affect its state, but extMu is required to // ensure that concurrent users of the Kernel *outside* the Kernel's // control cannot affect its state by calling e.g. // Kernel.SendExternalSignal.) extMu sync.Mutex `state:"nosave"` // started is true if Start has been called. Unless otherwise specified, // all Kernel fields become immutable once started becomes true. started bool `state:"nosave"` // All of the following fields are immutable unless otherwise specified. // Platform is the platform that is used to execute tasks in the created // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is // embedded anonymously (the same issue applies). platform.Platform `state:"nosave"` // mf provides application memory. mf *pgalloc.MemoryFile `state:"nosave"` // See InitKernelArgs for the meaning of these fields. featureSet *cpuid.FeatureSet timekeeper *Timekeeper tasks *TaskSet rootUserNamespace *auth.UserNamespace networkStack inet.Stack `state:"nosave"` applicationCores uint useHostCores bool extraAuxv []arch.AuxEntry vdso *loader.VDSO rootUTSNamespace *UTSNamespace rootIPCNamespace *IPCNamespace rootAbstractSocketNamespace *AbstractSocketNamespace // futexes is the "root" futex.Manager, from which all others are forked. // This is necessary to ensure that shared futexes are coherent across all // tasks, including those created by CreateProcess. futexes *futex.Manager // globalInit is the thread group whose leader has ID 1 in the root PID // namespace. globalInit is stored separately so that it is accessible even // after all tasks in the thread group have exited, such that ID 1 is no // longer mapped. // // globalInit is mutable until it is assigned by the first successful call // to CreateProcess, and is protected by extMu. globalInit *ThreadGroup // realtimeClock is a ktime.Clock based on timekeeper's Realtime. realtimeClock *timekeeperClock // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. monotonicClock *timekeeperClock // syslog is the kernel log. syslog syslog // runningTasksMu synchronizes disable/enable of cpuClockTicker when // the kernel is idle (runningTasks == 0). // // runningTasksMu is used to exclude critical sections when the timer // disables itself and when the first active task enables the timer, // ensuring that tasks always see a valid cpuClock value. runningTasksMu sync.Mutex `state:"nosave"` // runningTasks is the total count of tasks currently in // TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are // not blocked or stopped. // // runningTasks must be accessed atomically. Increments from 0 to 1 are // further protected by runningTasksMu (see incRunningTasks). runningTasks int64 // cpuClock is incremented every linux.ClockTick. cpuClock is used to // measure task CPU usage, since sampling monotonicClock twice on every // syscall turns out to be unreasonably expensive. This is similar to how // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING), // although Linux also uses scheduler timing information to improve // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do // since "preeemptive" scheduling is managed by the Go runtime, which // doesn't provide this information. // // cpuClock is mutable, and is accessed using atomic memory operations. cpuClock uint64 // cpuClockTicker increments cpuClock. cpuClockTicker *ktime.Timer `state:"nosave"` // cpuClockTickerDisabled indicates that cpuClockTicker has been // disabled because no tasks are running. // // cpuClockTickerDisabled is protected by runningTasksMu. cpuClockTickerDisabled bool // cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the // point it was disabled. It is cached here to avoid a lock ordering // violation with cpuClockTicker.mu when runningTaskMu is held. // // cpuClockTickerSetting is only valid when cpuClockTickerDisabled is // true. // // cpuClockTickerSetting is protected by runningTasksMu. cpuClockTickerSetting ktime.Setting // fdMapUids is an ever-increasing counter for generating FDTable uids. // // fdMapUids is mutable, and is accessed using atomic memory operations. fdMapUids uint64 // uniqueID is used to generate unique identifiers. // // uniqueID is mutable, and is accessed using atomic memory operations. uniqueID uint64 // nextInotifyCookie is a monotonically increasing counter used for // generating unique inotify event cookies. // // nextInotifyCookie is mutable, and is accessed using atomic memory // operations. nextInotifyCookie uint32 // netlinkPorts manages allocation of netlink socket port IDs. netlinkPorts *port.Manager // saveErr is the error causing the sandbox to exit during save, if // any. It is protected by extMu. saveErr error `state:"nosave"` // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` // sockets is the list of all network sockets the system. Protected by // extMu. sockets socketList // nextSocketEntry is the next entry number to use in sockets. Protected // by extMu. nextSocketEntry uint64 // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` // DirentCacheLimiter controls the number of total dirent entries can be in // caches. Not all caches use it, only the caches that use host resources use // the limiter. It may be nil if disabled. DirentCacheLimiter *fs.DirentCacheLimiter // unimplementedSyscallEmitterOnce is used in the initialization of // unimplementedSyscallEmitter. unimplementedSyscallEmitterOnce sync.Once `state:"nosave"` // unimplementedSyscallEmitter is used to emit unimplemented syscall // events. This is initialized lazily on the first unimplemented // syscall. unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` // SpecialOpts contains special kernel options. SpecialOpts // VFS keeps the filesystem state used across the kernel. VFS *vfs.VirtualFilesystem } // InitKernelArgs holds arguments to Init. type InitKernelArgs struct { // FeatureSet is the emulated CPU feature set. FeatureSet *cpuid.FeatureSet // Timekeeper manages time for all tasks in the system. Timekeeper *Timekeeper // RootUserNamespace is the root user namespace. RootUserNamespace *auth.UserNamespace // NetworkStack is the TCP/IP network stack. NetworkStack may be nil. NetworkStack inet.Stack // ApplicationCores is the number of logical CPUs visible to sandboxed // applications. The set of logical CPU IDs is [0, ApplicationCores); thus // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the // most significant bit in cpu_possible_mask + 1. ApplicationCores uint // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it // will be overridden. UseHostCores bool // ExtraAuxv contains additional auxiliary vector entries that are added to // each process by the ELF loader. ExtraAuxv []arch.AuxEntry // Vdso holds the VDSO and its parameter page. Vdso *loader.VDSO // RootUTSNamespace is the root UTS namespace. RootUTSNamespace *UTSNamespace // RootIPCNamespace is the root IPC namespace. RootIPCNamespace *IPCNamespace // RootAbstractSocketNamespace is the root Abstract Socket namespace. RootAbstractSocketNamespace *AbstractSocketNamespace // PIDNamespace is the root PID namespace. PIDNamespace *PIDNamespace } // Init initialize the Kernel with no tasks. // // Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile // before calling Init. func (k *Kernel) Init(args InitKernelArgs) error { if args.FeatureSet == nil { return fmt.Errorf("FeatureSet is nil") } if args.Timekeeper == nil { return fmt.Errorf("Timekeeper is nil") } if args.RootUserNamespace == nil { return fmt.Errorf("RootUserNamespace is nil") } if args.ApplicationCores == 0 { return fmt.Errorf("ApplicationCores is 0") } k.featureSet = args.FeatureSet k.timekeeper = args.Timekeeper k.tasks = newTaskSet(args.PIDNamespace) k.rootUserNamespace = args.RootUserNamespace k.rootUTSNamespace = args.RootUTSNamespace k.rootIPCNamespace = args.RootIPCNamespace k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace k.networkStack = args.NetworkStack k.applicationCores = args.ApplicationCores if args.UseHostCores { k.useHostCores = true maxCPU, err := hostcpu.MaxPossibleCPU() if err != nil { return fmt.Errorf("Failed to get maximum CPU number: %v", err) } minAppCores := uint(maxCPU) + 1 if k.applicationCores < minAppCores { log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) k.applicationCores = minAppCores } } k.extraAuxv = args.ExtraAuxv k.vdso = args.Vdso k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() return nil } // SaveTo saves the state of k to w. // // Preconditions: The kernel must be paused throughout the call to SaveTo. func (k *Kernel) SaveTo(w io.Writer) error { saveStart := time.Now() ctx := k.SupervisorContext() // Do not allow other Kernel methods to affect it while it's being saved. k.extMu.Lock() defer k.extMu.Unlock() // Stop time. k.pauseTimeLocked() defer k.resumeTimeLocked() // Evict all evictable MemoryFile allocations. k.mf.StartEvictions() k.mf.WaitForEvictions() // Flush write operations on open files so data reaches backing storage. // This must come after MemoryFile eviction since eviction may cause file // writes. if err := k.tasks.flushWritesToFiles(ctx); err != nil { return err } // Remove all epoll waiter objects from underlying wait queues. // NOTE: for programs to resume execution in future snapshot scenarios, // we will need to re-establish these waiter objects after saving. k.tasks.unregisterEpollWaiters() // Clear the dirent cache before saving because Dirents must be Loaded in a // particular order (parents before children), and Loading dirents from a cache // breaks that order. if err := k.flushMountSourceRefs(); err != nil { return err } // Ensure that all pending asynchronous work is complete: // - inode and mount release // - asynchronuous IO fs.AsyncBarrier() // Once all fs work has completed (flushed references have all been released), // reset mount mappings. This allows individual mounts to save how inodes map // to filesystem resources. Without this, fs.Inodes cannot be restored. fs.SaveInodeMappings() // Discard unsavable mappings, such as those for host file descriptors. // This must be done after waiting for "asynchronous fs work", which // includes async I/O that may touch application memory. if err := k.invalidateUnsavableMappings(ctx); err != nil { return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) } // Save the CPUID FeatureSet before the rest of the kernel so we can // verify its compatibility on restore before attempting to restore the // entire kernel, which may fail on an incompatible machine. // // N.B. This will also be saved along with the full kernel save below. cpuidStart := time.Now() if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) // Save the kernel state. kernelStart := time.Now() var stats state.Stats if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil { return err } log.Infof("Kernel save stats: %s", &stats) log.Infof("Kernel save took [%s].", time.Since(kernelStart)) // Save the memory file's state. memoryStart := time.Now() if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) log.Infof("Overall save took [%s].", time.Since(saveStart)) return nil } // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. func (k *Kernel) flushMountSourceRefs() error { // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) { if _, ok := flushed[tg.mounts]; ok { // Already flushed. return } tg.mounts.FlushMountSourceRefs() flushed[tg.mounts] = struct{}{} }) k.tasks.mu.RUnlock() // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) } // forEachFDPaused applies the given function to each open file descriptor in // each task. // // Precondition: Must be called with the kernel paused. func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if t.fdTable == nil { continue } t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } }) } return err } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { // TODO(gvisor.dev/issues/1663): Add save support for VFS2. return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { return nil } // Here we need all metadata synced. syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { name, _ := file.Dirent.FullName(nil /* root */) // Wrap this error in ErrSaveRejection so that it will trigger a save // error, rather than a panic. This also allows us to distinguish Fsync // errors from state file errors in state.Save. return fs.ErrSaveRejection{ Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), } } return nil }) } // Preconditions: The kernel must be paused. func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { invalidated := make(map[*mm.MemoryManager]struct{}) k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() for t := range k.tasks.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if mm := t.tc.MemoryManager; mm != nil { if _, ok := invalidated[mm]; !ok { if err := mm.InvalidateUnsavable(ctx); err != nil { return err } invalidated[mm] = struct{}{} } } // I really wish we just had a sync.Map of all MMs... if r, ok := t.runState.(*runSyscallAfterExecStop); ok { if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { return err } } } return nil } func (ts *TaskSet) unregisterEpollWaiters() { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if e, ok := file.FileOperations.(*epoll.EventPoll); ok { e.UnregisterEpollWaiters() } }) } } } // LoadFrom returns a new Kernel loaded from args. func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error { loadStart := time.Now() k.networkStack = net initAppCores := k.applicationCores // Load the pre-saved CPUID FeatureSet. // // N.B. This was also saved along with the full kernel below, so we // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() var features cpuid.FeatureSet if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) // Verify that the FeatureSet is usable on this host. We do this before // Kernel load so that the explicit CPUID mismatch error has priority // over floating point state restore errors that may occur on load on // an incompatible machine. if err := features.CheckHostCompatible(); err != nil { return err } // Load the kernel state. kernelStart := time.Now() var stats state.Stats if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil { return err } log.Infof("Kernel load stats: %s", &stats) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) // Load the memory file's state. memoryStart := time.Now() if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) log.Infof("Overall load took [%s]", time.Since(loadStart)) k.Timekeeper().SetClocks(clocks) if net != nil { net.Resume() } // Ensure that all pending asynchronous work is complete: // - namedpipe opening // - inode file opening if err := fs.AsyncErrorBarrier(); err != nil { return err } tcpip.AsyncLoading.Wait() log.Infof("Overall load took [%s] after async work", time.Since(loadStart)) // Applications may size per-cpu structures based on k.applicationCores, so // it can't change across save/restore. When we are virtualizing CPU // numbers, this isn't a problem. However, when we are exposing host CPU // assignments, we can't tolerate an increase in the number of host CPUs, // which could result in getcpu(2) returning CPUs that applications expect // not to exist. if k.useHostCores && initAppCores > k.applicationCores { return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) } return nil } // UniqueID returns a unique identifier. func (k *Kernel) UniqueID() uint64 { id := atomic.AddUint64(&k.uniqueID, 1) if id == 0 { panic("unique identifier generator wrapped around") } return id } // CreateProcessArgs holds arguments to kernel.CreateProcess. type CreateProcessArgs struct { // Filename is the filename to load as the init binary. // // If this is provided as "", File will be checked, then the file will be // guessed via Argv[0]. Filename string // File is a passed host FD pointing to a file to load as the init binary. // // This is checked if and only if Filename is "". File fsbridge.File // Argvv is a list of arguments. Argv []string // Envv is a list of environment variables. Envv []string // WorkingDirectory is the initial working directory. // // This defaults to the root if empty. WorkingDirectory string // Credentials is the initial credentials. Credentials *auth.Credentials // FDTable is the initial set of file descriptors. If CreateProcess succeeds, // it takes a reference on FDTable. FDTable *FDTable // Umask is the initial umask. Umask uint // Limits is the initial resource limits. Limits *limits.LimitSet // MaxSymlinkTraversals is the maximum number of symlinks to follow // during resolution. MaxSymlinkTraversals uint // UTSNamespace is the initial UTS namespace. UTSNamespace *UTSNamespace // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace // PIDNamespace is the initial PID Namespace. PIDNamespace *PIDNamespace // AbstractSocketNamespace is the initial Abstract Socket namespace. AbstractSocketNamespace *AbstractSocketNamespace // MountNamespace optionally contains the mount namespace for this // process. If nil, the init process's mount namespace is used. // // Anyone setting MountNamespace must donate a reference (i.e. // increment it). MountNamespace *fs.MountNamespace // MountNamespaceVFS2 optionally contains the mount namespace for this // process. If nil, the init process's mount namespace is used. // // Anyone setting MountNamespaceVFS2 must donate a reference (i.e. // increment it). MountNamespaceVFS2 *vfs.MountNamespace // ContainerID is the container that the process belongs to. ContainerID string } // NewContext returns a context.Context that represents the task that will be // created by args.NewContext(k). func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext { return &createProcessContext{ Logger: log.Log(), k: k, args: args, } } // createProcessContext is a context.Context that represents the context // associated with a task that is being created. type createProcessContext struct { context.NoopSleeper log.Logger k *Kernel args *CreateProcessArgs } // Value implements context.Context.Value. func (ctx *createProcessContext) Value(key interface{}) interface{} { switch key { case CtxKernel: return ctx.k case CtxPIDNamespace: return ctx.args.PIDNamespace case CtxUTSNamespace: return ctx.args.UTSNamespace case CtxIPCNamespace: return ctx.args.IPCNamespace case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: if ctx.args.MountNamespace != nil { // MountNamespace.Root() will take a reference on the root dirent for us. return ctx.args.MountNamespace.Root() } return nil case vfs.CtxRoot: if ctx.args.MountNamespaceVFS2 == nil { return nil } // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. return ctx.args.MountNamespaceVFS2.Root() case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } // MountNamespaceVFS2 takes a reference for us. return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: return ctx.args.Limits case pgalloc.CtxMemoryFile: return ctx.k.mf case pgalloc.CtxMemoryFileProvider: return ctx.k case platform.CtxPlatform: return ctx.k case uniqueid.CtxGlobalUniqueID: return ctx.k.UniqueID() case uniqueid.CtxGlobalUniqueIDProvider: return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() case unimpl.CtxEvents: return ctx.k default: return nil } } // CreateProcess creates a new task in a new thread group with the given // options. The new task has no parent and is in the root PID namespace. // // If k.Start() has already been called, then the created process must be // started by calling kernel.StartProcess(tg). // // If k.Start() has not yet been called, then the created task will begin // running when k.Start() is called. // // CreateProcess has no analogue in Linux; it is used to create the initial // application task, as well as processes started by the control server. func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { k.extMu.Lock() defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) ctx := args.NewContext(k) var ( opener fsbridge.Lookup fsContext *FSContext mntns *fs.MountNamespace ) if VFS2Enabled { mntnsVFS2 := args.MountNamespaceVFS2 if mntnsVFS2 == nil { // MountNamespaceVFS2 adds a reference to the namespace, which is // transferred to the new process. mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2() } // Get the root directory from the MountNamespace. root := args.MountNamespaceVFS2.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. defer root.DecRef() // Grab the working directory. wd := root // Default. if args.WorkingDirectory != "" { pop := vfs.PathOperation{ Root: root, Start: wd, Path: fspath.Parse(args.WorkingDirectory), FollowFinalSymlink: true, } var err error wd, err = k.VFS.GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } defer wd.DecRef() } opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) fsContext = NewFSContextVFS2(root, wd, args.Umask) } else { mntns = args.MountNamespace if mntns == nil { mntns = k.GlobalInit().Leader().MountNamespace() mntns.IncRef() } // Get the root directory from the MountNamespace. root := mntns.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. defer root.DecRef() // Grab the working directory. remainingTraversals := args.MaxSymlinkTraversals wd := root // Default. if args.WorkingDirectory != "" { var err error wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } defer wd.DecRef() } opener = fsbridge.NewFSLookup(mntns, root, wd) fsContext = newFSContext(root, wd, args.Umask) } tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) // Check which file to start from. switch { case args.Filename != "": // If a filename is given, take that. // Set File to nil so we resolve the path in LoadTaskImage. args.File = nil case args.File != nil: // If File is set, take the File provided directly. default: // Otherwise look at Argv and see if the first argument is a valid path. if len(args.Argv) == 0 { return nil, 0, fmt.Errorf("no filename or command provided") } if !filepath.IsAbs(args.Argv[0]) { return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) } args.Filename = args.Argv[0] } // Create a fresh task context. remainingTraversals := args.MaxSymlinkTraversals loadArgs := loader.LoadArgs{ Opener: opener, RemainingTraversals: &remainingTraversals, ResolveFinal: true, Filename: args.Filename, File: args.File, CloseOnExec: false, Argv: args.Argv, Envv: args.Envv, Features: k.featureSet, } tc, se := k.LoadTaskImage(ctx, loadArgs) if se != nil { return nil, 0, errors.New(se.String()) } // Take a reference on the FDTable, which will be transferred to // TaskSet.NewTask(). args.FDTable.IncRef() // Create the task. config := &TaskConfig{ Kernel: k, ThreadGroup: tg, TaskContext: tc, FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, MountNamespaceVFS2: args.MountNamespaceVFS2, ContainerID: args.ContainerID, } t, err := k.tasks.NewTask(config) if err != nil { return nil, 0, err } t.traceExecEvent(tc) // Simulate exec for tracing. // Success. tgid := k.tasks.Root.IDOfThreadGroup(tg) if k.globalInit == nil { k.globalInit = tg } return tg, tgid, nil } // StartProcess starts running a process that was created with CreateProcess. func (k *Kernel) StartProcess(tg *ThreadGroup) { t := tg.Leader() tid := k.tasks.Root.IDOfTask(t) t.Start(tid) } // Start starts execution of all tasks in k. // // Preconditions: Start may be called exactly once. func (k *Kernel) Start() error { k.extMu.Lock() defer k.extMu.Unlock() if k.globalInit == nil { return fmt.Errorf("kernel contains no tasks") } if k.started { return fmt.Errorf("kernel already started") } k.started = true k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k)) k.cpuClockTicker.Swap(ktime.Setting{ Enabled: true, Period: linux.ClockTick, }) // If k was created by LoadKernelFrom, timers were stopped during // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, // this is a no-op. k.resumeTimeLocked() // Start task goroutines. k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() for t, tid := range k.tasks.Root.tids { t.Start(tid) } return nil } // pauseTimeLocked pauses all Timers and Timekeeper updates. // // Preconditions: Any task goroutines running in k must be stopped. k.extMu // must be locked. func (k *Kernel) pauseTimeLocked() { // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before // Kernel.Start(). if k.cpuClockTicker != nil { k.cpuClockTicker.Pause() } // By precondition, nothing else can be interacting with PIDNamespace.tids // or FDTable.files, so we can iterate them without synchronization. (We // can't hold the TaskSet mutex when pausing thread group timers because // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet // mutex, while holding the Timer mutex.) for t := range k.tasks.Root.tids { if t == t.tg.leader { t.tg.itimerRealTimer.Pause() for _, it := range t.tg.timers { it.PauseTimer() } } // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } }) } } k.timekeeper.PauseUpdates() } // resumeTimeLocked resumes all Timers and Timekeeper updates. If // pauseTimeLocked has not been previously called, resumeTimeLocked has no // effect. // // Preconditions: Any task goroutines running in k must be stopped. k.extMu // must be locked. func (k *Kernel) resumeTimeLocked() { if k.cpuClockTicker != nil { k.cpuClockTicker.Resume() } k.timekeeper.ResumeUpdates() for t := range k.tasks.Root.tids { if t == t.tg.leader { t.tg.itimerRealTimer.Resume() for _, it := range t.tg.timers { it.ResumeTimer() } } if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } }) } } } func (k *Kernel) incRunningTasks() { for { tasks := atomic.LoadInt64(&k.runningTasks) if tasks != 0 { // Standard case. Simply increment. if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) { continue } return } // Transition from 0 -> 1. Synchronize with other transitions and timer. k.runningTasksMu.Lock() tasks = atomic.LoadInt64(&k.runningTasks) if tasks != 0 { // We're no longer the first task, no need to // re-enable. atomic.AddInt64(&k.runningTasks, 1) k.runningTasksMu.Unlock() return } if !k.cpuClockTickerDisabled { // Timer was never disabled. atomic.StoreInt64(&k.runningTasks, 1) k.runningTasksMu.Unlock() return } // We need to update cpuClock for all of the ticks missed while we // slept, and then re-enable the timer. // // The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify // always increments cpuClock by 1 regardless of the number of // expirations as a heuristic to avoid over-accounting in cases of CPU // throttling. // // We want to cover the normal case, when all time should be accounted, // so we increment for all expirations. Throttling is less concerning // here because the ticker is only disabled from Notify. This means // that Notify must schedule and compensate for the throttled period // before the timer is disabled. Throttling while the timer is disabled // doesn't matter, as nothing is running or reading cpuClock anyways. // // S/R also adds complication, as there are two cases. Recall that // monotonicClock will jump forward on restore. // // 1. If the ticker is enabled during save, then on Restore Notify is // called with many expirations, covering the time jump, but cpuClock // is only incremented by 1. // // 2. If the ticker is disabled during save, then after Restore the // first wakeup will call this function and cpuClock will be // incremented by the number of expirations across the S/R. // // These cause very different value of cpuClock. But again, since // nothing was running while the ticker was disabled, those differences // don't matter. setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now()) if exp > 0 { atomic.AddUint64(&k.cpuClock, exp) } // Now that cpuClock is updated it is safe to allow other tasks to // transition to running. atomic.StoreInt64(&k.runningTasks, 1) // N.B. we must unlock before calling Swap to maintain lock ordering. // // cpuClockTickerDisabled need not wait until after Swap to become // true. It is sufficient that the timer *will* be enabled. k.cpuClockTickerDisabled = false k.runningTasksMu.Unlock() // This won't call Notify (unless it's been ClockTick since setting.At // above). This means we skip the thread group work in Notify. However, // since nothing was running while we were disabled, none of the timers // could have expired. k.cpuClockTicker.Swap(setting) return } } func (k *Kernel) decRunningTasks() { tasks := atomic.AddInt64(&k.runningTasks, -1) if tasks < 0 { panic(fmt.Sprintf("Invalid running count %d", tasks)) } // Nothing to do. The next CPU clock tick will disable the timer if // there is still nothing running. This provides approximately one tick // of slack in which we can switch back and forth between idle and // active without an expensive transition. } // WaitExited blocks until all tasks in k have exited. func (k *Kernel) WaitExited() { k.tasks.liveGoroutines.Wait() } // Kill requests that all tasks in k immediately exit as if group exiting with // status es. Kill does not wait for tasks to exit. func (k *Kernel) Kill(es ExitStatus) { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.Kill(es) } // Pause requests that all tasks in k temporarily stop executing, and blocks // until all tasks in k have stopped. Multiple calls to Pause nest and require // an equal number of calls to Unpause to resume execution. func (k *Kernel) Pause() { k.extMu.Lock() k.tasks.BeginExternalStop() k.extMu.Unlock() k.tasks.runningGoroutines.Wait() } // Unpause ends the effect of a previous call to Pause. If Unpause is called // without a matching preceding call to Pause, Unpause may panic. func (k *Kernel) Unpause() { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.EndExternalStop() } // SendExternalSignal injects a signal into the kernel. // // context is used only for debugging to describe how the signal was received. // // Preconditions: Kernel must have an init process. func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { k.extMu.Lock() defer k.extMu.Unlock() k.sendExternalSignal(info, context) } // SendContainerSignal sends the given signal to all processes inside the // namespace that match the given container ID. func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() var lastErr error for tg := range k.tasks.Root.tgids { if tg.leader.ContainerID() == cid { tg.signalHandlers.mu.Lock() infoCopy := *info if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { lastErr = err } tg.signalHandlers.mu.Unlock() } } return lastErr } // RebuildTraceContexts rebuilds the trace context for all tasks. // // Unfortunately, if these are built while tracing is not enabled, then we will // not have meaningful trace data. Rebuilding here ensures that we can do so // after tracing has been enabled. func (k *Kernel) RebuildTraceContexts() { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() for t, tid := range k.tasks.Root.tids { t.rebuildTraceContext(tid) } } // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet } // Timekeeper returns the Timekeeper. func (k *Kernel) Timekeeper() *Timekeeper { return k.timekeeper } // TaskSet returns the TaskSet. func (k *Kernel) TaskSet() *TaskSet { return k.tasks } // RootUserNamespace returns the root UserNamespace. func (k *Kernel) RootUserNamespace() *auth.UserNamespace { return k.rootUserNamespace } // RootUTSNamespace returns the root UTSNamespace. func (k *Kernel) RootUTSNamespace() *UTSNamespace { return k.rootUTSNamespace } // RootIPCNamespace returns the root IPCNamespace. func (k *Kernel) RootIPCNamespace() *IPCNamespace { return k.rootIPCNamespace } // RootPIDNamespace returns the root PIDNamespace. func (k *Kernel) RootPIDNamespace() *PIDNamespace { return k.tasks.Root } // RootAbstractSocketNamespace returns the root AbstractSocketNamespace. func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { return k.rootAbstractSocketNamespace } // NetworkStack returns the network stack. NetworkStack may return nil if no // network stack is available. func (k *Kernel) NetworkStack() inet.Stack { return k.networkStack } // GlobalInit returns the thread group with ID 1 in the root PID namespace, or // nil if no such thread group exists. GlobalInit may return a thread group // containing no tasks if the thread group has already exited. func (k *Kernel) GlobalInit() *ThreadGroup { k.extMu.Lock() defer k.extMu.Unlock() return k.globalInit } // TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace. func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) { k.globalInit = tg } // ApplicationCores returns the number of CPUs visible to sandboxed // applications. func (k *Kernel) ApplicationCores() uint { return k.applicationCores } // RealtimeClock returns the application CLOCK_REALTIME clock. func (k *Kernel) RealtimeClock() ktime.Clock { return k.realtimeClock } // MonotonicClock returns the application CLOCK_MONOTONIC clock. func (k *Kernel) MonotonicClock() ktime.Clock { return k.monotonicClock } // CPUClockNow returns the current value of k.cpuClock. func (k *Kernel) CPUClockNow() uint64 { return atomic.LoadUint64(&k.cpuClock) } // Syslog returns the syslog. func (k *Kernel) Syslog() *syslog { return &k.syslog } // GenerateInotifyCookie generates a unique inotify event cookie. // // Returned values may overlap with previously returned values if the value // space is exhausted. 0 is not a valid cookie value, all other values // representable in a uint32 are allowed. func (k *Kernel) GenerateInotifyCookie() uint32 { id := atomic.AddUint32(&k.nextInotifyCookie, 1) // Wrap-around is explicitly allowed for inotify event cookies. if id == 0 { id = atomic.AddUint32(&k.nextInotifyCookie, 1) } return id } // NetlinkPorts returns the netlink port manager. func (k *Kernel) NetlinkPorts() *port.Manager { return k.netlinkPorts } // SaveError returns the sandbox error that caused the kernel to exit during // save. func (k *Kernel) SaveError() error { k.extMu.Lock() defer k.extMu.Unlock() return k.saveErr } // SetSaveError sets the sandbox error that caused the kernel to exit during // save, if one is not already set. func (k *Kernel) SetSaveError(err error) { k.extMu.Lock() defer k.extMu.Unlock() if k.saveErr == nil { k.saveErr = err } } var _ tcpip.Clock = (*Kernel)(nil) // NowNanoseconds implements tcpip.Clock.NowNanoseconds. func (k *Kernel) NowNanoseconds() int64 { now, err := k.timekeeper.GetTime(sentrytime.Realtime) if err != nil { panic("Kernel.NowNanoseconds: " + err.Error()) } return now } // NowMonotonic implements tcpip.Clock.NowMonotonic. func (k *Kernel) NowMonotonic() int64 { now, err := k.timekeeper.GetTime(sentrytime.Monotonic) if err != nil { panic("Kernel.NowMonotonic: " + err.Error()) } return now } // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or // LoadFrom. func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { k.mf = mf } // MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { return k.mf } // SupervisorContext returns a Context with maximum privileges in k. It should // only be used by goroutines outside the control of the emulated kernel // defined by e. // // Callers are responsible for ensuring that the returned Context is not used // concurrently with changes to the Kernel. func (k *Kernel) SupervisorContext() context.Context { return supervisorContext{ Logger: log.Log(), k: k, } } // SocketEntry represents a socket recorded in Kernel.sockets. It implements // refs.WeakRefUser for sockets stored in the socket table. // // +stateify savable type SocketEntry struct { socketEntry k *Kernel Sock *refs.WeakRef ID uint64 // Socket table entry number. } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. func (s *SocketEntry) WeakRefGone() { s.k.extMu.Lock() s.k.sockets.Remove(s) s.k.extMu.Unlock() } // RecordSocket adds a socket to the system-wide socket table for tracking. // // Precondition: Caller must hold a reference to sock. func (k *Kernel) RecordSocket(sock *fs.File) { k.extMu.Lock() id := k.nextSocketEntry k.nextSocketEntry++ s := &SocketEntry{k: k, ID: id} s.Sock = refs.NewWeakRef(sock, s) k.sockets.PushBack(s) k.extMu.Unlock() } // ListSockets returns a snapshot of all sockets. func (k *Kernel) ListSockets() []*SocketEntry { k.extMu.Lock() var socks []*SocketEntry for s := k.sockets.Front(); s != nil; s = s.Next() { socks = append(socks, s) } k.extMu.Unlock() return socks } // supervisorContext is a privileged context. type supervisorContext struct { context.NoopSleeper log.Logger k *Kernel } // Value implements context.Context. func (ctx supervisorContext) Value(key interface{}) interface{} { switch key { case CtxCanTrace: // The supervisor context can trace anything. (None of // supervisorContext's users are expected to invoke ptrace, but ptrace // permissions are required for certain file accesses.) return func(*Task, bool) bool { return true } case CtxKernel: return ctx.k case CtxPIDNamespace: return ctx.k.tasks.Root case CtxUTSNamespace: return ctx.k.rootUTSNamespace case CtxIPCNamespace: return ctx.k.rootIPCNamespace case auth.CtxCredentials: // The supervisor context is global root. return auth.NewRootCredentials(ctx.k.rootUserNamespace) case fs.CtxRoot: if ctx.k.globalInit != nil { return ctx.k.globalInit.mounts.Root() } return nil case vfs.CtxRoot: if ctx.k.globalInit == nil { return vfs.VirtualDentry{} } mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() defer mntns.DecRef() // Root() takes a reference on the root dirent for us. return mntns.Root() case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } // MountNamespaceVFS2() takes a reference for us. return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: // No limits apply. return limits.NewLimitSet() case pgalloc.CtxMemoryFile: return ctx.k.mf case pgalloc.CtxMemoryFileProvider: return ctx.k case platform.CtxPlatform: return ctx.k case uniqueid.CtxGlobalUniqueID: return ctx.k.UniqueID() case uniqueid.CtxGlobalUniqueIDProvider: return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() case unimpl.CtxEvents: return ctx.k default: return nil } } // Rate limits for the number of unimplemented syscall events. const ( unimplementedSyscallsMaxRate = 100 // events per second unimplementedSyscallBurst = 1000 // events ) // EmitUnimplementedEvent emits an UnimplementedSyscall event via the event // channel. func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { k.unimplementedSyscallEmitterOnce.Do(func() { k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst) }) t := TaskFromContext(ctx) k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ Tid: int32(t.ThreadID()), Registers: t.Arch().StateData().Proto(), }) }