summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/kernel.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/kernel.go')
-rw-r--r--pkg/sentry/kernel/kernel.go104
1 files changed, 84 insertions, 20 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 6feda8fa1..de8a95854 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -50,6 +50,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -225,6 +226,11 @@ type Kernel struct {
// by extMu.
nextSocketEntry uint64
+ // socketMount is a disconnected vfs.Mount, not included in k.vfs,
+ // representing a sockfs.filesystem. socketMount is used to back
+ // VirtualDentries representing anonymous sockets.
+ socketMount *vfs.Mount
+
// deviceRegistry is used to save/restore device.SimpleDevices.
deviceRegistry struct{} `state:".(*device.Registry)"`
@@ -348,6 +354,19 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
k.futexes = futex.NewManager()
k.netlinkPorts = port.New()
+ if VFS2Enabled {
+ if err := k.vfs.Init(); err != nil {
+ return fmt.Errorf("failed to initialize VFS: %v", err)
+ }
+ fs := sockfs.NewFilesystem(&k.vfs)
+ // NewDisconnectedMount will take an additional reference on fs.
+ defer fs.DecRef()
+ sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to initialize socket mount: %v", err)
+ }
+ k.socketMount = sm
+ }
return nil
}
@@ -545,15 +564,25 @@ func (ts *TaskSet) unregisterEpollWaiters() {
ts.mu.RLock()
defer ts.mu.RUnlock()
+
+ // Tasks that belong to the same process could potentially point to the
+ // same FDTable. So we retain a map of processed ones to avoid
+ // processing the same FDTable multiple times.
+ processed := make(map[*FDTable]struct{})
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
- if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
- e.UnregisterEpollWaiters()
- }
- })
+ if t.fdTable == nil {
+ continue
+ }
+ if _, ok := processed[t.fdTable]; ok {
+ continue
}
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+ e.UnregisterEpollWaiters()
+ }
+ })
+ processed[t.fdTable] = struct{}{}
}
}
@@ -1015,14 +1044,17 @@ func (k *Kernel) pauseTimeLocked() {
// This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
if t.fdTable != nil {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if !VFS2Enabled {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ if VFS2Enabled {
+ if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok {
+ tfd.PauseTimer()
+ }
+ } else {
if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.PauseTimer()
}
- })
- }
+ }
+ })
}
}
k.timekeeper.PauseUpdates()
@@ -1047,15 +1079,18 @@ func (k *Kernel) resumeTimeLocked() {
it.ResumeTimer()
}
}
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if !VFS2Enabled {
- if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ if VFS2Enabled {
+ if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok {
+ tfd.ResumeTimer()
+ }
+ } else {
if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.ResumeTimer()
}
- })
- }
+ }
+ })
}
}
}
@@ -1416,9 +1451,10 @@ func (k *Kernel) SupervisorContext() context.Context {
// +stateify savable
type SocketEntry struct {
socketEntry
- k *Kernel
- Sock *refs.WeakRef
- ID uint64 // Socket table entry number.
+ k *Kernel
+ Sock *refs.WeakRef
+ SockVFS2 *vfs.FileDescription
+ ID uint64 // Socket table entry number.
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
@@ -1441,7 +1477,30 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
k.extMu.Unlock()
}
+// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for
+// tracking.
+//
+// Precondition: Caller must hold a reference to sock.
+//
+// Note that the socket table will not hold a reference on the
+// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
+ k.extMu.Lock()
+ id := k.nextSocketEntry
+ k.nextSocketEntry++
+ s := &SocketEntry{
+ k: k,
+ ID: id,
+ SockVFS2: sock,
+ }
+ k.sockets.PushBack(s)
+ k.extMu.Unlock()
+}
+
// ListSockets returns a snapshot of all sockets.
+//
+// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// to get a reference on a socket in the table.
func (k *Kernel) ListSockets() []*SocketEntry {
k.extMu.Lock()
var socks []*SocketEntry
@@ -1452,6 +1511,11 @@ func (k *Kernel) ListSockets() []*SocketEntry {
return socks
}
+// SocketMount returns the global socket mount.
+func (k *Kernel) SocketMount() *vfs.Mount {
+ return k.socketMount
+}
+
// supervisorContext is a privileged context.
type supervisorContext struct {
context.NoopSleeper