diff options
Diffstat (limited to 'pkg/sentry/kernel/kernel.go')
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 183 |
1 files changed, 106 insertions, 77 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index f253a81d9..38b49cba2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -39,34 +39,34 @@ import ( "sync/atomic" "time" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/cpuid" - "gvisor.googlesource.com/gvisor/pkg/eventchannel" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/refs" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd" - "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" - "gvisor.googlesource.com/gvisor/pkg/sentry/inet" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - "gvisor.googlesource.com/gvisor/pkg/sentry/loader" - "gvisor.googlesource.com/gvisor/pkg/sentry/mm" - "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" - sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" - uspb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" - "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" - "gvisor.googlesource.com/gvisor/pkg/state" - "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" + sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/tcpip" ) // Kernel represents an emulated Linux kernel. It must be initialized by calling @@ -155,7 +155,7 @@ type Kernel struct { // cpuClockTicker increments cpuClock. cpuClockTicker *ktime.Timer `state:"nosave"` - // fdMapUids is an ever-increasing counter for generating FDMap uids. + // fdMapUids is an ever-increasing counter for generating FDTable uids. // // fdMapUids is mutable, and is accessed using atomic memory operations. fdMapUids uint64 @@ -381,13 +381,27 @@ func (k *Kernel) SaveTo(w io.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. func (k *Kernel) flushMountSourceRefs() error { - // Flush all mount sources for currently mounted filesystems. + // Flush all mount sources for currently mounted filesystems in the + // root mount namespace. k.mounts.FlushMountSourceRefs() + // Some tasks may have other mount namespaces; flush those as well. + flushed := make(map[*fs.MountNamespace]struct{}) + k.tasks.mu.RLock() + k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if _, ok := flushed[tg.mounts]; ok { + // Already flushed. + return + } + tg.mounts.FlushMountSourceRefs() + flushed[tg.mounts] = struct{}{} + }) + k.tasks.mu.RUnlock() + // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(desc descriptor) error { - desc.file.Dirent.Inode.MountSource.FlushDirentRefs() + return k.tasks.forEachFDPaused(func(file *fs.File) error { + file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) } @@ -396,35 +410,35 @@ func (k *Kernel) flushMountSourceRefs() error { // task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error { +func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if t.fds == nil { + if t.fdTable == nil { continue } - for _, desc := range t.fds.files { - if err := f(desc); err != nil { - return err + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if lastErr := f(file); lastErr != nil && err == nil { + err = lastErr } - } + }) } - return nil + return err } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - return ts.forEachFDPaused(func(desc descriptor) error { - if flags := desc.file.Flags(); !flags.Write { + return ts.forEachFDPaused(func(file *fs.File) error { + if flags := file.Flags(); !flags.Write { return nil } - if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { return nil } // Here we need all metadata synced. - syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { - name, _ := desc.file.Dirent.FullName(nil /* root */) + name, _ := file.Dirent.FullName(nil /* root */) // Wrap this error in ErrSaveRejection // so that it will trigger a save // error, rather than a panic. This @@ -469,14 +483,12 @@ func (ts *TaskSet) unregisterEpollWaiters() { defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if fdmap := t.fds; fdmap != nil { - for _, desc := range fdmap.files { - if desc.file != nil { - if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok { - e.UnregisterEpollWaiters() - } + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() } - } + }) } } } @@ -524,6 +536,8 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { } log.Infof("Memory load took [%s].", time.Since(memoryStart)) + log.Infof("Overall load took [%s]", time.Since(loadStart)) + // Ensure that all pending asynchronous work is complete: // - namedpipe opening // - inode file opening @@ -588,9 +602,9 @@ type CreateProcessArgs struct { // Credentials is the initial credentials. Credentials *auth.Credentials - // FDMap is the initial set of file descriptors. If CreateProcess succeeds, - // it takes a reference on FDMap. - FDMap *FDMap + // FDTable is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDTable. + FDTable *FDTable // Umask is the initial umask. Umask uint @@ -611,12 +625,18 @@ type CreateProcessArgs struct { // AbstractSocketNamespace is the initial Abstract Socket namespace. AbstractSocketNamespace *AbstractSocketNamespace + // MountNamespace optionally contains the mount namespace for this + // process. If nil, the kernel's mount namespace is used. + // + // Anyone setting MountNamespace must donate a reference (i.e. + // increment it). + MountNamespace *fs.MountNamespace + // Root optionally contains the dirent that serves as the root for the // process. If nil, the mount namespace's root is used as the process' // root. // - // Anyone setting Root must donate a reference (i.e. increment it) to - // keep it alive until it is decremented by CreateProcess. + // Anyone setting Root must donate a reference (i.e. increment it). Root *fs.Dirent // ContainerID is the container that the process belongs to. @@ -659,7 +679,7 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.args.Credentials case fs.CtxRoot: if ctx.args.Root != nil { - // Take a refernce on the root dirent that will be + // Take a reference on the root dirent that will be // given to the caller. ctx.args.Root.IncRef() return ctx.args.Root @@ -715,20 +735,29 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, return nil, 0, fmt.Errorf("no kernel MountNamespace") } - tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + // Grab the mount namespace. + mounts := args.MountNamespace + if mounts == nil { + // If no MountNamespace was configured, then use the kernel's + // root mount namespace, with an extra reference that will be + // donated to the task. + mounts = k.mounts + mounts.IncRef() + } + + tg := k.newThreadGroup(mounts, k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) ctx := args.NewContext(k) // Grab the root directory. root := args.Root if root == nil { - root = fs.RootFromContext(ctx) - // Is the root STILL nil? - if root == nil { - return nil, 0, fmt.Errorf("CreateProcessArgs.Root was not provided, and failed to get root from context") - } + // If no Root was configured, then get it from the + // MountNamespace. + root = mounts.Root() } + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. defer root.DecRef() - args.Root = nil // Grab the working directory. remainingTraversals := uint(args.MaxSymlinkTraversals) @@ -760,9 +789,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, return nil, 0, errors.New(se.String()) } - // Take a reference on the FDMap, which will be transferred to + // Take a reference on the FDTable, which will be transferred to // TaskSet.NewTask(). - args.FDMap.IncRef() + args.FDTable.IncRef() // Create the task. config := &TaskConfig{ @@ -770,7 +799,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, ThreadGroup: tg, TaskContext: tc, FSContext: newFSContext(root, wd, args.Umask), - FDMap: args.FDMap, + FDTable: args.FDTable, Credentials: args.Credentials, AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, @@ -842,7 +871,7 @@ func (k *Kernel) pauseTimeLocked() { } // By precondition, nothing else can be interacting with PIDNamespace.tids - // or FDMap.files, so we can iterate them without synchronization. (We + // or FDTable.files, so we can iterate them without synchronization. (We // can't hold the TaskSet mutex when pausing thread group timers because // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet // mutex, while holding the Timer mutex.) @@ -853,14 +882,14 @@ func (k *Kernel) pauseTimeLocked() { it.PauseTimer() } } - // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. - if fdm := t.fds; fdm != nil { - for _, desc := range fdm.files { - if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } - } + }) } } k.timekeeper.PauseUpdates() @@ -885,12 +914,12 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - if fdm := t.fds; fdm != nil { - for _, desc := range fdm.files { - if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } - } + }) } } } |