summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/kernel.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/kernel.go')
-rw-r--r--pkg/sentry/kernel/kernel.go177
1 files changed, 113 insertions, 64 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 15dae0f5b..d6c21adb7 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -220,13 +220,18 @@ type Kernel struct {
// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
- // sockets is the list of all network sockets the system. Protected by
- // extMu.
+ // sockets is the list of all network sockets in the system.
+ // Protected by extMu.
+ // TODO(gvisor.dev/issue/1624): Only used by VFS1.
sockets socketList
- // nextSocketEntry is the next entry number to use in sockets. Protected
+ // socketsVFS2 records all network sockets in the system. Protected by
+ // extMu.
+ socketsVFS2 map[*vfs.FileDescription]*SocketRecord
+
+ // nextSocketRecord is the next entry number to use in sockets. Protected
// by extMu.
- nextSocketEntry uint64
+ nextSocketRecord uint64
// deviceRegistry is used to save/restore device.SimpleDevices.
deviceRegistry struct{} `state:".(*device.Registry)"`
@@ -248,7 +253,7 @@ type Kernel struct {
// SpecialOpts contains special kernel options.
SpecialOpts
- // VFS keeps the filesystem state used across the kernel.
+ // vfs keeps the filesystem state used across the kernel.
vfs vfs.VirtualFilesystem
// hostMount is the Mount used for file descriptors that were imported
@@ -376,7 +381,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.netlinkPorts = port.New()
if VFS2Enabled {
- if err := k.vfs.Init(); err != nil {
+ ctx := k.SupervisorContext()
+ if err := k.vfs.Init(ctx); err != nil {
return fmt.Errorf("failed to initialize VFS: %v", err)
}
@@ -384,19 +390,19 @@ func (k *Kernel) Init(args InitKernelArgs) error {
if err != nil {
return fmt.Errorf("failed to create pipefs filesystem: %v", err)
}
- defer pipeFilesystem.DecRef()
+ defer pipeFilesystem.DecRef(ctx)
pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create pipefs mount: %v", err)
}
k.pipeMount = pipeMount
- tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
+ tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
if err != nil {
return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
}
- defer tmpfsFilesystem.DecRef()
- defer tmpfsRoot.DecRef()
+ defer tmpfsFilesystem.DecRef(ctx)
+ defer tmpfsRoot.DecRef(ctx)
shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create tmpfs mount: %v", err)
@@ -407,12 +413,14 @@ func (k *Kernel) Init(args InitKernelArgs) error {
if err != nil {
return fmt.Errorf("failed to create sockfs filesystem: %v", err)
}
- defer socketFilesystem.DecRef()
+ defer socketFilesystem.DecRef(ctx)
socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create sockfs mount: %v", err)
}
k.socketMount = socketMount
+
+ k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord)
}
return nil
@@ -430,8 +438,8 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
defer k.extMu.Unlock()
// Stop time.
- k.pauseTimeLocked()
- defer k.resumeTimeLocked()
+ k.pauseTimeLocked(ctx)
+ defer k.resumeTimeLocked(ctx)
// Evict all evictable MemoryFile allocations.
k.mf.StartEvictions()
@@ -447,12 +455,12 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
// Remove all epoll waiter objects from underlying wait queues.
// NOTE: for programs to resume execution in future snapshot scenarios,
// we will need to re-establish these waiter objects after saving.
- k.tasks.unregisterEpollWaiters()
+ k.tasks.unregisterEpollWaiters(ctx)
// Clear the dirent cache before saving because Dirents must be Loaded in a
// particular order (parents before children), and Loading dirents from a cache
// breaks that order.
- if err := k.flushMountSourceRefs(); err != nil {
+ if err := k.flushMountSourceRefs(ctx); err != nil {
return err
}
@@ -505,7 +513,11 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
-func (k *Kernel) flushMountSourceRefs() error {
+func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
+ if VFS2Enabled {
+ return nil // Not relevant.
+ }
+
// Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
@@ -521,7 +533,7 @@ func (k *Kernel) flushMountSourceRefs() error {
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
- return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
+ return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
file.Dirent.Inode.MountSource.FlushDirentRefs()
return nil
})
@@ -531,12 +543,7 @@ func (k *Kernel) flushMountSourceRefs() error {
// each task.
//
// Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if VFS2Enabled {
- return nil
- }
-
+func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) {
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
@@ -544,7 +551,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
if t.fdTable == nil {
continue
}
- t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
if lastErr := f(file, fileVFS2); lastErr != nil && err == nil {
err = lastErr
}
@@ -555,7 +562,11 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
+ if VFS2Enabled {
+ return nil
+ }
+
+ return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
if flags := file.Flags(); !flags.Write {
return nil
}
@@ -602,7 +613,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
return nil
}
-func (ts *TaskSet) unregisterEpollWaiters() {
+func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
if VFS2Enabled {
return
@@ -623,7 +634,7 @@ func (ts *TaskSet) unregisterEpollWaiters() {
if _, ok := processed[t.fdTable]; ok {
continue
}
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
e.UnregisterEpollWaiters()
}
@@ -887,20 +898,21 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
opener fsbridge.Lookup
fsContext *FSContext
mntns *fs.MountNamespace
+ mntnsVFS2 *vfs.MountNamespace
)
if VFS2Enabled {
- mntnsVFS2 := args.MountNamespaceVFS2
+ mntnsVFS2 = args.MountNamespaceVFS2
if mntnsVFS2 == nil {
// MountNamespaceVFS2 adds a reference to the namespace, which is
// transferred to the new process.
mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
}
// Get the root directory from the MountNamespace.
- root := args.MountNamespaceVFS2.Root()
+ root := mntnsVFS2.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Grab the working directory.
wd := root // Default.
@@ -918,7 +930,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
- defer wd.DecRef()
+ defer wd.DecRef(ctx)
}
opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
fsContext = NewFSContextVFS2(root, wd, args.Umask)
@@ -933,7 +945,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
root := mntns.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Grab the working directory.
remainingTraversals := args.MaxSymlinkTraversals
@@ -944,7 +956,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
- defer wd.DecRef()
+ defer wd.DecRef(ctx)
}
opener = fsbridge.NewFSLookup(mntns, root, wd)
fsContext = newFSContext(root, wd, args.Umask)
@@ -1007,7 +1019,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
UTSNamespace: args.UTSNamespace,
IPCNamespace: args.IPCNamespace,
AbstractSocketNamespace: args.AbstractSocketNamespace,
- MountNamespaceVFS2: args.MountNamespaceVFS2,
+ MountNamespaceVFS2: mntnsVFS2,
ContainerID: args.ContainerID,
}
t, err := k.tasks.NewTask(config)
@@ -1054,7 +1066,7 @@ func (k *Kernel) Start() error {
// If k was created by LoadKernelFrom, timers were stopped during
// Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
// this is a no-op.
- k.resumeTimeLocked()
+ k.resumeTimeLocked(k.SupervisorContext())
// Start task goroutines.
k.tasks.mu.RLock()
defer k.tasks.mu.RUnlock()
@@ -1066,9 +1078,10 @@ func (k *Kernel) Start() error {
// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
-func (k *Kernel) pauseTimeLocked() {
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
+func (k *Kernel) pauseTimeLocked(ctx context.Context) {
// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
// Kernel.Start().
if k.cpuClockTicker != nil {
@@ -1090,7 +1103,7 @@ func (k *Kernel) pauseTimeLocked() {
// This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
if VFS2Enabled {
if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
tfd.PauseTimer()
@@ -1110,9 +1123,10 @@ func (k *Kernel) pauseTimeLocked() {
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
-func (k *Kernel) resumeTimeLocked() {
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
+func (k *Kernel) resumeTimeLocked(ctx context.Context) {
if k.cpuClockTicker != nil {
k.cpuClockTicker.Resume()
}
@@ -1126,7 +1140,7 @@ func (k *Kernel) resumeTimeLocked() {
}
}
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
if VFS2Enabled {
if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
tfd.ResumeTimer()
@@ -1262,6 +1276,13 @@ func (k *Kernel) Pause() {
k.tasks.aioGoroutines.Wait()
}
+// ReceiveTaskStates receives full states for all tasks.
+func (k *Kernel) ReceiveTaskStates() {
+ k.extMu.Lock()
+ k.tasks.PullFullState()
+ k.extMu.Unlock()
+}
+
// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
@@ -1498,20 +1519,27 @@ func (k *Kernel) SupervisorContext() context.Context {
}
}
-// SocketEntry represents a socket recorded in Kernel.sockets. It implements
+// SocketRecord represents a socket recorded in Kernel.socketsVFS2.
+//
+// +stateify savable
+type SocketRecord struct {
+ k *Kernel
+ Sock *refs.WeakRef // TODO(gvisor.dev/issue/1624): Only used by VFS1.
+ SockVFS2 *vfs.FileDescription // Only used by VFS2.
+ ID uint64 // Socket table entry number.
+}
+
+// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It implements
// refs.WeakRefUser for sockets stored in the socket table.
//
// +stateify savable
-type SocketEntry struct {
+type SocketRecordVFS1 struct {
socketEntry
- k *Kernel
- Sock *refs.WeakRef
- SockVFS2 *vfs.FileDescription
- ID uint64 // Socket table entry number.
+ SocketRecord
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (s *SocketEntry) WeakRefGone() {
+func (s *SocketRecordVFS1) WeakRefGone(context.Context) {
s.k.extMu.Lock()
s.k.sockets.Remove(s)
s.k.extMu.Unlock()
@@ -1522,9 +1550,14 @@ func (s *SocketEntry) WeakRefGone() {
// Precondition: Caller must hold a reference to sock.
func (k *Kernel) RecordSocket(sock *fs.File) {
k.extMu.Lock()
- id := k.nextSocketEntry
- k.nextSocketEntry++
- s := &SocketEntry{k: k, ID: id}
+ id := k.nextSocketRecord
+ k.nextSocketRecord++
+ s := &SocketRecordVFS1{
+ SocketRecord: SocketRecord{
+ k: k,
+ ID: id,
+ },
+ }
s.Sock = refs.NewWeakRef(sock, s)
k.sockets.PushBack(s)
k.extMu.Unlock()
@@ -1536,29 +1569,45 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
-// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+// vfs.FileDescription.
func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
k.extMu.Lock()
- id := k.nextSocketEntry
- k.nextSocketEntry++
- s := &SocketEntry{
+ if _, ok := k.socketsVFS2[sock]; ok {
+ panic(fmt.Sprintf("Socket %p added twice", sock))
+ }
+ id := k.nextSocketRecord
+ k.nextSocketRecord++
+ s := &SocketRecord{
k: k,
ID: id,
SockVFS2: sock,
}
- k.sockets.PushBack(s)
+ k.socketsVFS2[sock] = s
+ k.extMu.Unlock()
+}
+
+// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table.
+func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) {
+ k.extMu.Lock()
+ delete(k.socketsVFS2, sock)
k.extMu.Unlock()
}
// ListSockets returns a snapshot of all sockets.
//
-// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef()
// to get a reference on a socket in the table.
-func (k *Kernel) ListSockets() []*SocketEntry {
+func (k *Kernel) ListSockets() []*SocketRecord {
k.extMu.Lock()
- var socks []*SocketEntry
- for s := k.sockets.Front(); s != nil; s = s.Next() {
- socks = append(socks, s)
+ var socks []*SocketRecord
+ if VFS2Enabled {
+ for _, s := range k.socketsVFS2 {
+ socks = append(socks, s)
+ }
+ } else {
+ for s := k.sockets.Front(); s != nil; s = s.Next() {
+ socks = append(socks, &s.SocketRecord)
+ }
}
k.extMu.Unlock()
return socks
@@ -1600,7 +1649,7 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
return vfs.VirtualDentry{}
}
mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
// Root() takes a reference on the root dirent for us.
return mntns.Root()
case vfs.CtxMountNamespace: