diff options
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r-- | pkg/sentry/kernel/epoll/epoll.go | 2 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table.go | 24 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 40 | ||||
-rw-r--r-- | pkg/sentry/kernel/semaphore/semaphore.go | 1 | ||||
-rw-r--r-- | pkg/sentry/kernel/sessions.go | 2 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 29 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_clone.go | 9 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_start.go | 4 | ||||
-rw-r--r-- | pkg/sentry/kernel/thread_group.go | 7 |
9 files changed, 68 insertions, 50 deletions
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 8bffb78fc..592650923 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -296,8 +296,10 @@ func (*readyCallback) Callback(w *waiter.Entry) { e.waitingList.Remove(entry) e.readyList.PushBack(entry) entry.curList = &e.readyList + e.listsMu.Unlock() e.Notify(waiter.EventIn) + return } e.listsMu.Unlock() diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 58001d56c..d09d97825 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -191,10 +191,12 @@ func (f *FDTable) Size() int { return int(size) } -// forEach iterates over all non-nil files. +// forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { + // retries tracks the number of failed TryIncRef attempts for the same FD. + retries := 0 fd := int32(0) for { file, fileVFS2, flags, ok := f.getAll(fd) @@ -204,17 +206,26 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes switch { case file != nil: if !file.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations)) + } continue // Race caught. } fn(fd, file, nil, flags) file.DecRef() case fileVFS2 != nil: if !fileVFS2.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl())) + } continue // Race caught. } fn(fd, nil, fileVFS2, flags) fileVFS2.DecRef() } + retries = 0 fd++ } } @@ -327,7 +338,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc fd = f.next } for fd < end { - if d, _, _ := f.get(fd); d == nil { + if d, _, _ := f.getVFS2(fd); d == nil { f.setVFS2(fd, file, flags) if fd == f.next { // Update next search start position. @@ -447,7 +458,10 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { } } -// GetFDs returns a list of valid fds. +// GetFDs returns a sorted list of valid fds. +// +// Precondition: The caller must be running on the task goroutine, or Task.mu +// must be locked. func (f *FDTable) GetFDs() []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { @@ -522,7 +536,9 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { case orig2 != nil: orig2.IncRef() } - f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + if orig != nil || orig2 != nil { + f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + } return orig, orig2 } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1d627564f..6feda8fa1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -467,6 +467,11 @@ func (k *Kernel) flushMountSourceRefs() error { // // Precondition: Must be called with the kernel paused. func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return nil + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -484,7 +489,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - // TODO(gvisor.dev/issues/1663): Add save support for VFS2. + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil @@ -533,6 +538,11 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { } func (ts *TaskSet) unregisterEpollWaiters() { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -1005,11 +1015,14 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.PauseTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if !VFS2Enabled { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } + }) + } } } k.timekeeper.PauseUpdates() @@ -1034,12 +1047,15 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.ResumeTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if !VFS2Enabled { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + }) + } } } } diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 1000f3287..c00fa1138 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -554,6 +554,7 @@ func (s *sem) wakeWaiters() { for w := s.waiters.Front(); w != nil; { if s.value < w.value { // Still blocked, skip it. + w = w.Next() continue } w.ch <- struct{}{} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 047b5214d..0e19286de 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -246,7 +246,7 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { var lastErr error for tg := range tasks.Root.tgids { - if tg.ProcessGroup() == pg { + if tg.processGroup == pg { tg.signalHandlers.mu.Lock() infoCopy := *info if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c0dbbe890..8452ddf5b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -555,13 +555,6 @@ type Task struct { // // startTime is protected by mu. startTime ktime.Time - - // oomScoreAdj is the task's OOM score adjustment. This is currently not - // used but is maintained for consistency. - // TODO(gvisor.dev/issue/1967) - // - // oomScoreAdj is protected by mu, and is owned by the task goroutine. - oomScoreAdj int32 } func (t *Task) savePtraceTracer() *Task { @@ -856,27 +849,17 @@ func (t *Task) ContainerID() string { return t.containerID } -// OOMScoreAdj gets the task's OOM score adjustment. -func (t *Task) OOMScoreAdj() (int32, error) { - t.mu.Lock() - defer t.mu.Unlock() - if t.ExitState() == TaskExitDead { - return 0, syserror.ESRCH - } - return t.oomScoreAdj, nil +// OOMScoreAdj gets the task's thread group's OOM score adjustment. +func (t *Task) OOMScoreAdj() int32 { + return atomic.LoadInt32(&t.tg.oomScoreAdj) } -// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be -// between -1000 and 1000 inclusive. +// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The +// value should be between -1000 and 1000 inclusive. func (t *Task) SetOOMScoreAdj(adj int32) error { - t.mu.Lock() - defer t.mu.Unlock() - if t.ExitState() == TaskExitDead { - return syserror.ESRCH - } if adj > 1000 || adj < -1000 { return syserror.EINVAL } - t.oomScoreAdj = adj + atomic.StoreInt32(&t.tg.oomScoreAdj, adj) return nil } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index dda502bb8..e1ecca99e 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -15,6 +15,8 @@ package kernel import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -260,15 +262,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { sh = sh.Fork() } tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature } - adj, err := t.OOMScoreAdj() - if err != nil { - return 0, nil, err - } - cfg := &TaskConfig{ Kernel: t.k, ThreadGroup: tg, @@ -287,7 +285,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), - OOMScoreAdj: adj, } if opts.NewThreadGroup { cfg.Parent = t diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 2bbf48bb8..a5035bb7f 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -93,9 +93,6 @@ type TaskConfig struct { // ContainerID is the container the new task belongs to. ContainerID string - - // oomScoreAdj is the task's OOM score adjustment. - OOMScoreAdj int32 } // NewTask creates a new task defined by cfg. @@ -146,7 +143,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, - oomScoreAdj: cfg.OOMScoreAdj, } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 268f62e9d..52849f5b3 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -254,6 +254,13 @@ type ThreadGroup struct { // // tty is protected by the signal mutex. tty *TTY + + // oomScoreAdj is the thread group's OOM score adjustment. This is + // currently not used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is accessed using atomic memory operations. + oomScoreAdj int32 } // NewThreadGroup returns a new, empty thread group in PID namespace pidns. The |