diff options
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r-- | pkg/sentry/kernel/epoll/epoll_state.go | 13 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table.go | 24 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 40 | ||||
-rw-r--r-- | pkg/sentry/kernel/semaphore/semaphore.go | 1 | ||||
-rw-r--r-- | pkg/sentry/kernel/sessions.go | 2 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 16 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_clone.go | 3 | ||||
-rw-r--r-- | pkg/sentry/kernel/thread_group.go | 7 |
8 files changed, 84 insertions, 22 deletions
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index a0d35d350..8e9f200d0 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -38,11 +38,14 @@ func (e *EventPoll) afterLoad() { } } - for it := e.waitingList.Front(); it != nil; it = it.Next() { - if it.id.File.Readiness(it.mask) != 0 { - e.waitingList.Remove(it) - e.readyList.PushBack(it) - it.curList = &e.readyList + for it := e.waitingList.Front(); it != nil; { + entry := it + it = it.Next() + + if entry.id.File.Readiness(entry.mask) != 0 { + e.waitingList.Remove(entry) + e.readyList.PushBack(entry) + entry.curList = &e.readyList e.Notify(waiter.EventIn) } } diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 58001d56c..d09d97825 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -191,10 +191,12 @@ func (f *FDTable) Size() int { return int(size) } -// forEach iterates over all non-nil files. +// forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { + // retries tracks the number of failed TryIncRef attempts for the same FD. + retries := 0 fd := int32(0) for { file, fileVFS2, flags, ok := f.getAll(fd) @@ -204,17 +206,26 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes switch { case file != nil: if !file.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations)) + } continue // Race caught. } fn(fd, file, nil, flags) file.DecRef() case fileVFS2 != nil: if !fileVFS2.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl())) + } continue // Race caught. } fn(fd, nil, fileVFS2, flags) fileVFS2.DecRef() } + retries = 0 fd++ } } @@ -327,7 +338,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc fd = f.next } for fd < end { - if d, _, _ := f.get(fd); d == nil { + if d, _, _ := f.getVFS2(fd); d == nil { f.setVFS2(fd, file, flags) if fd == f.next { // Update next search start position. @@ -447,7 +458,10 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { } } -// GetFDs returns a list of valid fds. +// GetFDs returns a sorted list of valid fds. +// +// Precondition: The caller must be running on the task goroutine, or Task.mu +// must be locked. func (f *FDTable) GetFDs() []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { @@ -522,7 +536,9 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { case orig2 != nil: orig2.IncRef() } - f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + if orig != nil || orig2 != nil { + f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + } return orig, orig2 } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1d627564f..6feda8fa1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -467,6 +467,11 @@ func (k *Kernel) flushMountSourceRefs() error { // // Precondition: Must be called with the kernel paused. func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return nil + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -484,7 +489,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - // TODO(gvisor.dev/issues/1663): Add save support for VFS2. + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil @@ -533,6 +538,11 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { } func (ts *TaskSet) unregisterEpollWaiters() { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -1005,11 +1015,14 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.PauseTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if !VFS2Enabled { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } + }) + } } } k.timekeeper.PauseUpdates() @@ -1034,12 +1047,15 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.ResumeTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if !VFS2Enabled { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + }) + } } } } diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 1000f3287..c00fa1138 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -554,6 +554,7 @@ func (s *sem) wakeWaiters() { for w := s.waiters.Front(); w != nil; { if s.value < w.value { // Still blocked, skip it. + w = w.Next() continue } w.ch <- struct{}{} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 047b5214d..0e19286de 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -246,7 +246,7 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { var lastErr error for tg := range tasks.Root.tgids { - if tg.ProcessGroup() == pg { + if tg.processGroup == pg { tg.signalHandlers.mu.Lock() infoCopy := *info if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 2cee2e6ed..8452ddf5b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -37,6 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -847,3 +848,18 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace { func (t *Task) ContainerID() string { return t.containerID } + +// OOMScoreAdj gets the task's thread group's OOM score adjustment. +func (t *Task) OOMScoreAdj() int32 { + return atomic.LoadInt32(&t.tg.oomScoreAdj) +} + +// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The +// value should be between -1000 and 1000 inclusive. +func (t *Task) SetOOMScoreAdj(adj int32) error { + if adj > 1000 || adj < -1000 { + return syserror.EINVAL + } + atomic.StoreInt32(&t.tg.oomScoreAdj, adj) + return nil +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 78866f280..e1ecca99e 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -15,6 +15,8 @@ package kernel import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -260,6 +262,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { sh = sh.Fork() } tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 268f62e9d..52849f5b3 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -254,6 +254,13 @@ type ThreadGroup struct { // // tty is protected by the signal mutex. tty *TTY + + // oomScoreAdj is the thread group's OOM score adjustment. This is + // currently not used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is accessed using atomic memory operations. + oomScoreAdj int32 } // NewThreadGroup returns a new, empty thread group in PID namespace pidns. The |