summary | refs | log | tree | commit | diff | homepage
path: root/pkg/sentry/kernel/kernel.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/kernel.go')
-rw-r--r-- pkg/sentry/kernel/kernel.go 183
1 files changed, 106 insertions, 77 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index f253a81d9..38b49cba2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -39,34 +39,34 @@ import (
"sync/atomic"
"time"
- "gvisor.googlesource.com/gvisor/pkg/abi/linux"
- "gvisor.googlesource.com/gvisor/pkg/cpuid"
- "gvisor.googlesource.com/gvisor/pkg/eventchannel"
- "gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/refs"
- "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
- "gvisor.googlesource.com/gvisor/pkg/sentry/context"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd"
- "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
- "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
- ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
- "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
- "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
- "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
- "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port"
- sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
- "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl"
- uspb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
- "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
- "gvisor.googlesource.com/gvisor/pkg/state"
- "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/eventchannel"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
+ "gvisor.dev/gvisor/pkg/sentry/hostcpu"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/sched"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/loader"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
+ sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
+ "gvisor.dev/gvisor/pkg/sentry/unimpl"
+ uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+ "gvisor.dev/gvisor/pkg/sentry/uniqueid"
+ "gvisor.dev/gvisor/pkg/state"
+ "gvisor.dev/gvisor/pkg/tcpip"
)
// Kernel represents an emulated Linux kernel. It must be initialized by calling
@@ -155,7 +155,7 @@ type Kernel struct {
// cpuClockTicker increments cpuClock.
cpuClockTicker *ktime.Timer `state:"nosave"`
- // fdMapUids is an ever-increasing counter for generating FDMap uids.
+ // fdMapUids is an ever-increasing counter for generating FDTable uids.
//
// fdMapUids is mutable, and is accessed using atomic memory operations.
fdMapUids uint64
@@ -381,13 +381,27 @@ func (k *Kernel) SaveTo(w io.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
func (k *Kernel) flushMountSourceRefs() error {
- // Flush all mount sources for currently mounted filesystems.
+ // Flush all mount sources for currently mounted filesystems in the
+ // root mount namespace.
k.mounts.FlushMountSourceRefs()
+ // Some tasks may have other mount namespaces; flush those as well.
+ flushed := make(map[*fs.MountNamespace]struct{})
+ k.tasks.mu.RLock()
+ k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
+ if _, ok := flushed[tg.mounts]; ok {
+ // Already flushed.
+ return
+ }
+ tg.mounts.FlushMountSourceRefs()
+ flushed[tg.mounts] = struct{}{}
+ })
+ k.tasks.mu.RUnlock()
+
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
- return k.tasks.forEachFDPaused(func(desc descriptor) error {
- desc.file.Dirent.Inode.MountSource.FlushDirentRefs()
+ return k.tasks.forEachFDPaused(func(file *fs.File) error {
+ file.Dirent.Inode.MountSource.FlushDirentRefs()
return nil
})
}
@@ -396,35 +410,35 @@ func (k *Kernel) flushMountSourceRefs() error {
// task.
//
// Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error {
+func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
- if t.fds == nil {
+ if t.fdTable == nil {
continue
}
- for _, desc := range t.fds.files {
- if err := f(desc); err != nil {
- return err
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if lastErr := f(file); lastErr != nil && err == nil {
+ err = lastErr
}
- }
+ })
}
- return nil
+ return err
}
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
- return ts.forEachFDPaused(func(desc descriptor) error {
- if flags := desc.file.Flags(); !flags.Write {
+ return ts.forEachFDPaused(func(file *fs.File) error {
+ if flags := file.Flags(); !flags.Write {
return nil
}
- if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
+ if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
return nil
}
// Here we need all metadata synced.
- syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+ syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
if err := fs.SaveFileFsyncError(syncErr); err != nil {
- name, _ := desc.file.Dirent.FullName(nil /* root */)
+ name, _ := file.Dirent.FullName(nil /* root */)
// Wrap this error in ErrSaveRejection
// so that it will trigger a save
// error, rather than a panic. This
@@ -469,14 +483,12 @@ func (ts *TaskSet) unregisterEpollWaiters() {
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
- if fdmap := t.fds; fdmap != nil {
- for _, desc := range fdmap.files {
- if desc.file != nil {
- if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok {
- e.UnregisterEpollWaiters()
- }
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+ e.UnregisterEpollWaiters()
}
- }
+ })
}
}
}
@@ -524,6 +536,8 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
}
log.Infof("Memory load took [%s].", time.Since(memoryStart))
+ log.Infof("Overall load took [%s]", time.Since(loadStart))
+
// Ensure that all pending asynchronous work is complete:
// - namedpipe opening
// - inode file opening
@@ -588,9 +602,9 @@ type CreateProcessArgs struct {
// Credentials is the initial credentials.
Credentials *auth.Credentials
- // FDMap is the initial set of file descriptors. If CreateProcess succeeds,
- // it takes a reference on FDMap.
- FDMap *FDMap
+ // FDTable is the initial set of file descriptors. If CreateProcess succeeds,
+ // it takes a reference on FDTable.
+ FDTable *FDTable
// Umask is the initial umask.
Umask uint
@@ -611,12 +625,18 @@ type CreateProcessArgs struct {
// AbstractSocketNamespace is the initial Abstract Socket namespace.
AbstractSocketNamespace *AbstractSocketNamespace
+ // MountNamespace optionally contains the mount namespace for this
+ // process. If nil, the kernel's mount namespace is used.
+ //
+ // Anyone setting MountNamespace must donate a reference (i.e.
+ // increment it).
+ MountNamespace *fs.MountNamespace
+
// Root optionally contains the dirent that serves as the root for the
// process. If nil, the mount namespace's root is used as the process'
// root.
//
- // Anyone setting Root must donate a reference (i.e. increment it) to
- // keep it alive until it is decremented by CreateProcess.
+ // Anyone setting Root must donate a reference (i.e. increment it).
Root *fs.Dirent
// ContainerID is the container that the process belongs to.
@@ -659,7 +679,7 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
return ctx.args.Credentials
case fs.CtxRoot:
if ctx.args.Root != nil {
- // Take a refernce on the root dirent that will be
+ // Take a reference on the root dirent that will be
// given to the caller.
ctx.args.Root.IncRef()
return ctx.args.Root
@@ -715,20 +735,29 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
return nil, 0, fmt.Errorf("no kernel MountNamespace")
}
- tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+ // Grab the mount namespace.
+ mounts := args.MountNamespace
+ if mounts == nil {
+ // If no MountNamespace was configured, then use the kernel's
+ // root mount namespace, with an extra reference that will be
+ // donated to the task.
+ mounts = k.mounts
+ mounts.IncRef()
+ }
+
+ tg := k.newThreadGroup(mounts, k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
ctx := args.NewContext(k)
// Grab the root directory.
root := args.Root
if root == nil {
- root = fs.RootFromContext(ctx)
- // Is the root STILL nil?
- if root == nil {
- return nil, 0, fmt.Errorf("CreateProcessArgs.Root was not provided, and failed to get root from context")
- }
+ // If no Root was configured, then get it from the
+ // MountNamespace.
+ root = mounts.Root()
}
+ // The call to newFSContext below will take a reference on root, so we
+ // don't need to hold this one.
defer root.DecRef()
- args.Root = nil
// Grab the working directory.
remainingTraversals := uint(args.MaxSymlinkTraversals)
@@ -760,9 +789,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
return nil, 0, errors.New(se.String())
}
- // Take a reference on the FDMap, which will be transferred to
+ // Take a reference on the FDTable, which will be transferred to
// TaskSet.NewTask().
- args.FDMap.IncRef()
+ args.FDTable.IncRef()
// Create the task.
config := &TaskConfig{
@@ -770,7 +799,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
ThreadGroup: tg,
TaskContext: tc,
FSContext: newFSContext(root, wd, args.Umask),
- FDMap: args.FDMap,
+ FDTable: args.FDTable,
Credentials: args.Credentials,
AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores),
UTSNamespace: args.UTSNamespace,
@@ -842,7 +871,7 @@ func (k *Kernel) pauseTimeLocked() {
}
// By precondition, nothing else can be interacting with PIDNamespace.tids
- // or FDMap.files, so we can iterate them without synchronization. (We
+ // or FDTable.files, so we can iterate them without synchronization. (We
// can't hold the TaskSet mutex when pausing thread group timers because
// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
// mutex, while holding the Timer mutex.)
@@ -853,14 +882,14 @@ func (k *Kernel) pauseTimeLocked() {
it.PauseTimer()
}
}
- // This means we'll iterate FDMaps shared by multiple tasks repeatedly,
+ // This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
- if fdm := t.fds; fdm != nil {
- for _, desc := range fdm.files {
- if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.PauseTimer()
}
- }
+ })
}
}
k.timekeeper.PauseUpdates()
@@ -885,12 +914,12 @@ func (k *Kernel) resumeTimeLocked() {
it.ResumeTimer()
}
}
- if fdm := t.fds; fdm != nil {
- for _, desc := range fdm.files {
- if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.ResumeTimer()
}
- }
+ })
}
}
}