summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--pkg/sentry/kernel/BUILD3
-rw-r--r--pkg/sentry/kernel/epoll/BUILD1
-rw-r--r--pkg/sentry/kernel/epoll/epoll.go22
-rw-r--r--pkg/sentry/kernel/fd_table.go79
-rw-r--r--pkg/sentry/kernel/kernel.go152
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go13
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go162
-rw-r--r--pkg/sentry/kernel/ptrace.go1
-rw-r--r--pkg/sentry/kernel/rseq.go2
-rw-r--r--pkg/sentry/kernel/semaphore/semaphore.go1
-rw-r--r--pkg/sentry/kernel/sessions.go2
-rw-r--r--pkg/sentry/kernel/shm/shm.go2
-rw-r--r--pkg/sentry/kernel/syscalls.go43
-rw-r--r--pkg/sentry/kernel/syscalls_state.go36
-rw-r--r--pkg/sentry/kernel/task.go52
-rw-r--r--pkg/sentry/kernel/task_clone.go9
-rw-r--r--pkg/sentry/kernel/task_context.go3
-rw-r--r--pkg/sentry/kernel/task_identity.go2
-rw-r--r--pkg/sentry/kernel/task_run.go3
-rw-r--r--pkg/sentry/kernel/task_sched.go4
-rw-r--r--pkg/sentry/kernel/task_signals.go17
-rw-r--r--pkg/sentry/kernel/task_start.go4
-rw-r--r--pkg/sentry/kernel/task_syscall.go26
-rw-r--r--pkg/sentry/kernel/thread_group.go7
-rw-r--r--pkg/sentry/kernel/time/time.go10
25 files changed, 458 insertions, 198 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index beba29a09..e47af66d6 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -169,6 +169,9 @@ go_library(
"//pkg/sentry/fs/lock",
"//pkg/sentry/fs/timerfd",
"//pkg/sentry/fsbridge",
+ "//pkg/sentry/fsimpl/kernfs",
+ "//pkg/sentry/fsimpl/pipefs",
+ "//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/hostcpu",
"//pkg/sentry/inet",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index dedf0fa15..75eedd5a2 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -24,6 +24,7 @@ go_library(
],
visibility = ["//pkg/sentry:internal"],
deps = [
+ "//pkg/abi/linux",
"//pkg/context",
"//pkg/refs",
"//pkg/sentry/fs",
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 8bffb78fc..3d78cd48f 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -20,6 +20,7 @@ import (
"fmt"
"syscall"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -30,19 +31,6 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// Event describes the event mask that was observed and the user data to be
-// returned when one of the events occurs. It has this format to match the linux
-// format to avoid extra copying/allocation when writing events to userspace.
-type Event struct {
- // Events is the event mask containing the set of events that have been
- // observed on an entry.
- Events uint32
-
- // Data is an opaque 64-bit value provided by the caller when adding the
- // entry, and returned to the caller when the entry reports an event.
- Data [2]int32
-}
-
// EntryFlags is a bitmask that holds an entry's flags.
type EntryFlags int
@@ -227,9 +215,9 @@ func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask {
}
// ReadEvents returns up to max available events.
-func (e *EventPoll) ReadEvents(max int) []Event {
+func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
var local pollEntryList
- var ret []Event
+ var ret []linux.EpollEvent
e.listsMu.Lock()
@@ -251,7 +239,7 @@ func (e *EventPoll) ReadEvents(max int) []Event {
}
// Add event to the array that will be returned to caller.
- ret = append(ret, Event{
+ ret = append(ret, linux.EpollEvent{
Events: uint32(ready),
Data: entry.userData,
})
@@ -296,8 +284,10 @@ func (*readyCallback) Callback(w *waiter.Entry) {
e.waitingList.Remove(entry)
e.readyList.PushBack(entry)
entry.curList = &e.readyList
+ e.listsMu.Unlock()
e.Notify(waiter.EventIn)
+ return
}
e.listsMu.Unlock()
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 58001d56c..ed40b5303 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -191,10 +191,12 @@ func (f *FDTable) Size() int {
return int(size)
}
-// forEach iterates over all non-nil files.
+// forEach iterates over all non-nil files in sorted order.
//
// It is the caller's responsibility to acquire an appropriate lock.
func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
+ // retries tracks the number of failed TryIncRef attempts for the same FD.
+ retries := 0
fd := int32(0)
for {
file, fileVFS2, flags, ok := f.getAll(fd)
@@ -204,17 +206,26 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
switch {
case file != nil:
if !file.TryIncRef() {
+ retries++
+ if retries > 1000 {
+ panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations))
+ }
continue // Race caught.
}
fn(fd, file, nil, flags)
file.DecRef()
case fileVFS2 != nil:
if !fileVFS2.TryIncRef() {
+ retries++
+ if retries > 1000 {
+ panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl()))
+ }
continue // Race caught.
}
fn(fd, nil, fileVFS2, flags)
fileVFS2.DecRef()
}
+ retries = 0
fd++
}
}
@@ -296,6 +307,61 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
return fds, nil
}
+// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available
+// greater than or equal to the fd parameter. All files will share the set
+// flags. Success is guaranteed to be all or none.
+func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return nil, syscall.EINVAL
+ }
+
+ // Default limit.
+ end := int32(math.MaxInt32)
+
+ // Ensure we don't get past the provided limit.
+ if limitSet := limits.FromContext(ctx); limitSet != nil {
+ lim := limitSet.Get(limits.NumberOfFiles)
+ if lim.Cur != limits.Infinity {
+ end = int32(lim.Cur)
+ }
+ if fd >= end {
+ return nil, syscall.EMFILE
+ }
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // From f.next to find available fd.
+ if fd < f.next {
+ fd = f.next
+ }
+
+ // Install all entries.
+ for i := fd; i < end && len(fds) < len(files); i++ {
+ if d, _, _ := f.getVFS2(i); d == nil {
+ f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
+ fds = append(fds, i) // Record the file descriptor.
+ }
+ }
+
+ // Failure? Unwind existing FDs.
+ if len(fds) < len(files) {
+ for _, i := range fds {
+ f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+ }
+ return nil, syscall.EMFILE
+ }
+
+ if fd == f.next {
+ // Update next search start position.
+ f.next = fds[len(fds)-1] + 1
+ }
+
+ return fds, nil
+}
+
// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
// the given file description. If it succeeds, it takes a reference on file.
func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
@@ -327,7 +393,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
fd = f.next
}
for fd < end {
- if d, _, _ := f.get(fd); d == nil {
+ if d, _, _ := f.getVFS2(fd); d == nil {
f.setVFS2(fd, file, flags)
if fd == f.next {
// Update next search start position.
@@ -447,7 +513,10 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
}
}
-// GetFDs returns a list of valid fds.
+// GetFDs returns a sorted list of valid fds.
+//
+// Precondition: The caller must be running on the task goroutine, or Task.mu
+// must be locked.
func (f *FDTable) GetFDs() []int32 {
fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
@@ -522,7 +591,9 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
case orig2 != nil:
orig2.IncRef()
}
- f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+ if orig != nil || orig2 != nil {
+ f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+ }
return orig, orig2
}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 1d627564f..c91b9dce2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -50,6 +50,8 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -248,6 +250,22 @@ type Kernel struct {
// VFS keeps the filesystem state used across the kernel.
vfs vfs.VirtualFilesystem
+ // hostMount is the Mount used for file descriptors that were imported
+ // from the host.
+ hostMount *vfs.Mount
+
+ // pipeMount is the Mount used for pipes created by the pipe() and pipe2()
+ // syscalls (as opposed to named pipes created by mknod()).
+ pipeMount *vfs.Mount
+
+ // socketMount is the Mount used for sockets created by the socket() and
+ // socketpair() syscalls. There are several cases where a socket dentry will
+ // not be contained in socketMount:
+ // 1. Socket files created by mknod()
+ // 2. Socket fds imported from the host (Kernel.hostMount is used for these)
+ // 3. Socket files created by binding Unix sockets to a file path
+ socketMount *vfs.Mount
+
// If set to true, report address space activation waits as if the task is in
// external wait so that the watchdog doesn't report the task stuck.
SleepForAddressSpaceActivation bool
@@ -348,6 +366,29 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
k.futexes = futex.NewManager()
k.netlinkPorts = port.New()
+
+ if VFS2Enabled {
+ if err := k.vfs.Init(); err != nil {
+ return fmt.Errorf("failed to initialize VFS: %v", err)
+ }
+
+ pipeFilesystem := pipefs.NewFilesystem(&k.vfs)
+ defer pipeFilesystem.DecRef()
+ pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to create pipefs mount: %v", err)
+ }
+ k.pipeMount = pipeMount
+
+ socketFilesystem := sockfs.NewFilesystem(&k.vfs)
+ defer socketFilesystem.DecRef()
+ socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to create sockfs mount: %v", err)
+ }
+ k.socketMount = socketMount
+ }
+
return nil
}
@@ -467,6 +508,11 @@ func (k *Kernel) flushMountSourceRefs() error {
//
// Precondition: Must be called with the kernel paused.
func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
+ // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+ if VFS2Enabled {
+ return nil
+ }
+
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
@@ -484,7 +530,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
}
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
- // TODO(gvisor.dev/issues/1663): Add save support for VFS2.
+ // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
if flags := file.Flags(); !flags.Write {
return nil
@@ -533,17 +579,32 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
}
func (ts *TaskSet) unregisterEpollWaiters() {
+ // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+ if VFS2Enabled {
+ return
+ }
+
ts.mu.RLock()
defer ts.mu.RUnlock()
+
+ // Tasks that belong to the same process could potentially point to the
+ // same FDTable. So we retain a map of processed ones to avoid
+ // processing the same FDTable multiple times.
+ processed := make(map[*FDTable]struct{})
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
- if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
- e.UnregisterEpollWaiters()
- }
- })
+ if t.fdTable == nil {
+ continue
+ }
+ if _, ok := processed[t.fdTable]; ok {
+ continue
}
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+ e.UnregisterEpollWaiters()
+ }
+ })
+ processed[t.fdTable] = struct{}{}
}
}
@@ -1005,9 +1066,15 @@ func (k *Kernel) pauseTimeLocked() {
// This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
- tfd.PauseTimer()
+ t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ if VFS2Enabled {
+ if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok {
+ tfd.PauseTimer()
+ }
+ } else {
+ if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
+ tfd.PauseTimer()
+ }
}
})
}
@@ -1035,9 +1102,15 @@ func (k *Kernel) resumeTimeLocked() {
}
}
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
- tfd.ResumeTimer()
+ t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ if VFS2Enabled {
+ if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok {
+ tfd.ResumeTimer()
+ }
+ } else {
+ if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
+ tfd.ResumeTimer()
+ }
}
})
}
@@ -1400,9 +1473,10 @@ func (k *Kernel) SupervisorContext() context.Context {
// +stateify savable
type SocketEntry struct {
socketEntry
- k *Kernel
- Sock *refs.WeakRef
- ID uint64 // Socket table entry number.
+ k *Kernel
+ Sock *refs.WeakRef
+ SockVFS2 *vfs.FileDescription
+ ID uint64 // Socket table entry number.
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
@@ -1425,7 +1499,30 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
k.extMu.Unlock()
}
+// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for
+// tracking.
+//
+// Precondition: Caller must hold a reference to sock.
+//
+// Note that the socket table will not hold a reference on the
+// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
+ k.extMu.Lock()
+ id := k.nextSocketEntry
+ k.nextSocketEntry++
+ s := &SocketEntry{
+ k: k,
+ ID: id,
+ SockVFS2: sock,
+ }
+ k.sockets.PushBack(s)
+ k.extMu.Unlock()
+}
+
// ListSockets returns a snapshot of all sockets.
+//
+// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// to get a reference on a socket in the table.
func (k *Kernel) ListSockets() []*SocketEntry {
k.extMu.Lock()
var socks []*SocketEntry
@@ -1533,3 +1630,26 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
func (k *Kernel) VFS() *vfs.VirtualFilesystem {
return &k.vfs
}
+
+// SetHostMount sets the hostfs mount.
+func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
+ if k.hostMount != nil {
+ panic("Kernel.hostMount cannot be set more than once")
+ }
+ k.hostMount = mnt
+}
+
+// HostMount returns the hostfs mount.
+func (k *Kernel) HostMount() *vfs.Mount {
+ return k.hostMount
+}
+
+// PipeMount returns the pipefs mount.
+func (k *Kernel) PipeMount() *vfs.Mount {
+ return k.pipeMount
+}
+
+// SocketMount returns the sockfs mount.
+func (k *Kernel) SocketMount() *vfs.Mount {
+ return k.socketMount
+}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 725e9db7d..62c8691f1 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -255,7 +255,8 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
// atomic, but requires no atomicity for writes larger than this.
wanted := ops.left()
- if avail := p.max - p.view.Size(); wanted > avail {
+ avail := p.max - p.view.Size()
+ if wanted > avail {
if wanted <= p.atomicIOBytes {
return 0, syserror.ErrWouldBlock
}
@@ -268,8 +269,14 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
return done, err
}
- if wanted > done {
- // Partial write due to full pipe.
+ if done < avail {
+ // Non-failure, but short write.
+ return done, nil
+ }
+ if done < wanted {
+ // Partial write due to full pipe. Note that this could also be
+ // the short write case above, we would expect a second call
+ // and the write to return zero bytes in this case.
return done, syserror.ErrWouldBlock
}
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index a5675bd70..b54f08a30 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -49,38 +49,42 @@ type VFSPipe struct {
}
// NewVFSPipe returns an initialized VFSPipe.
-func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe {
+func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
var vp VFSPipe
- initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes)
+ initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes)
return &vp
}
-// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics
-// during open:
+// ReaderWriterPair returns read-only and write-only FDs for vp.
//
-// "Normally, opening the FIFO blocks until the other end is opened also. A
-// process can open a FIFO in nonblocking mode. In this case, opening for
-// read-only will succeed even if no-one has opened on the write side yet,
-// opening for write-only will fail with ENXIO (no such device or address)
-// unless the other end has already been opened. Under Linux, opening a FIFO
-// for read and write will succeed both in blocking and nonblocking mode. POSIX
-// leaves this behavior undefined. This can be used to open a FIFO for writing
-// while there are no readers available." - fifo(7)
-func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
+// Preconditions: statusFlags should not contain an open access mode.
+func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
+ return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags)
+}
+
+// Open opens the pipe represented by vp.
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) {
vp.mu.Lock()
defer vp.mu.Unlock()
- readable := vfs.MayReadFileWithOpenFlags(flags)
- writable := vfs.MayWriteFileWithOpenFlags(flags)
+ readable := vfs.MayReadFileWithOpenFlags(statusFlags)
+ writable := vfs.MayWriteFileWithOpenFlags(statusFlags)
if !readable && !writable {
return nil, syserror.EINVAL
}
- vfd, err := vp.open(vfsd, vfsfd, flags)
- if err != nil {
- return nil, err
- }
+ fd := vp.newFD(mnt, vfsd, statusFlags)
+ // Named pipes have special blocking semantics during open:
+ //
+ // "Normally, opening the FIFO blocks until the other end is opened also. A
+ // process can open a FIFO in nonblocking mode. In this case, opening for
+ // read-only will succeed even if no-one has opened on the write side yet,
+ // opening for write-only will fail with ENXIO (no such device or address)
+ // unless the other end has already been opened. Under Linux, opening a
+ // FIFO for read and write will succeed both in blocking and nonblocking
+ // mode. POSIX leaves this behavior undefined. This can be used to open a
+ // FIFO for writing while there are no readers available." - fifo(7)
switch {
case readable && writable:
// Pipes opened for read-write always succeed without blocking.
@@ -89,23 +93,26 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf
case readable:
newHandleLocked(&vp.rWakeup)
- // If this pipe is being opened as nonblocking and there's no
+ // If this pipe is being opened as blocking and there's no
// writer, we have to wait for a writer to open the other end.
- if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
+ if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
+ fd.DecRef()
return nil, syserror.EINTR
}
case writable:
newHandleLocked(&vp.wWakeup)
- if !vp.pipe.HasReaders() {
- // Nonblocking, write-only opens fail with ENXIO when
- // the read side isn't open yet.
- if flags&linux.O_NONBLOCK != 0 {
+ if vp.pipe.isNamed && !vp.pipe.HasReaders() {
+ // Non-blocking, write-only opens fail with ENXIO when the read
+ // side isn't open yet.
+ if statusFlags&linux.O_NONBLOCK != 0 {
+ fd.DecRef()
return nil, syserror.ENXIO
}
// Wait for a reader to open the other end.
if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
+ fd.DecRef()
return nil, syserror.EINTR
}
}
@@ -114,96 +121,93 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf
panic("invalid pipe flags: must be readable, writable, or both")
}
- return vfd, nil
+ return fd, nil
}
// Preconditions: vp.mu must be held.
-func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
- var fd VFSPipeFD
- fd.flags = flags
- fd.readable = vfs.MayReadFileWithOpenFlags(flags)
- fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
- fd.vfsfd = vfsfd
- fd.pipe = &vp.pipe
+func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription {
+ fd := &VFSPipeFD{
+ pipe: &vp.pipe,
+ }
+ fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
+ DenyPRead: true,
+ DenyPWrite: true,
+ UseDentryMetadata: true,
+ })
switch {
- case fd.readable && fd.writable:
+ case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
vp.pipe.rOpen()
vp.pipe.wOpen()
- case fd.readable:
+ case fd.vfsfd.IsReadable():
vp.pipe.rOpen()
- case fd.writable:
+ case fd.vfsfd.IsWritable():
vp.pipe.wOpen()
default:
panic("invalid pipe flags: must be readable, writable, or both")
}
- return &fd, nil
+ return &fd.vfsfd
}
-// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. It is
-// expected that filesystesm will use this in a struct implementing
-// vfs.FileDescriptionImpl.
+// VFSPipeFD implements vfs.FileDescriptionImpl for pipes.
type VFSPipeFD struct {
- pipe *Pipe
- flags uint32
- readable bool
- writable bool
- vfsfd *vfs.FileDescription
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+
+ pipe *Pipe
}
// Release implements vfs.FileDescriptionImpl.Release.
func (fd *VFSPipeFD) Release() {
var event waiter.EventMask
- if fd.readable {
+ if fd.vfsfd.IsReadable() {
fd.pipe.rClose()
- event |= waiter.EventIn
+ event |= waiter.EventOut
}
- if fd.writable {
+ if fd.vfsfd.IsWritable() {
fd.pipe.wClose()
- event |= waiter.EventOut
+ event |= waiter.EventIn | waiter.EventHUp
}
if event == 0 {
panic("invalid pipe flags: must be readable, writable, or both")
}
- if fd.writable {
- fd.vfsfd.VirtualDentry().Mount().EndWrite()
- }
-
fd.pipe.Notify(event)
}
-// OnClose implements vfs.FileDescriptionImpl.OnClose.
-func (fd *VFSPipeFD) OnClose(_ context.Context) error {
- return nil
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ switch {
+ case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
+ return fd.pipe.rwReadiness()
+ case fd.vfsfd.IsReadable():
+ return fd.pipe.rReadiness()
+ case fd.vfsfd.IsWritable():
+ return fd.pipe.wReadiness()
+ default:
+ panic("pipe FD is neither readable nor writable")
+ }
}
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) {
- return 0, syserror.ESPIPE
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.pipe.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) {
+ fd.pipe.EventUnregister(e)
}
// Read implements vfs.FileDescriptionImpl.Read.
func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
- if !fd.readable {
- return 0, syserror.EINVAL
- }
-
return fd.pipe.Read(ctx, dst)
}
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) {
- return 0, syserror.ESPIPE
-}
-
// Write implements vfs.FileDescriptionImpl.Write.
func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
- if !fd.writable {
- return 0, syserror.EINVAL
- }
-
return fd.pipe.Write(ctx, src)
}
@@ -211,3 +215,17 @@ func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.Wr
func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
return fd.pipe.Ioctl(ctx, uio, args)
}
+
+// PipeSize implements fcntl(F_GETPIPE_SZ).
+func (fd *VFSPipeFD) PipeSize() int64 {
+ // Inline Pipe.FifoSize() rather than calling it with nil Context and
+ // fs.File and ignoring the returned error (which is always nil).
+ fd.pipe.mu.Lock()
+ defer fd.pipe.mu.Unlock()
+ return fd.pipe.max
+}
+
+// SetPipeSize implements fcntl(F_SETPIPE_SZ).
+func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
+ return fd.pipe.SetFifoSize(size)
+}
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 35ad97d5d..e23e796ef 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -184,7 +184,6 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 {
return false
}
- // TODO: Yama LSM
return true
}
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index ded95f532..18416643b 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -304,7 +304,7 @@ func (t *Task) rseqAddrInterrupt() {
}
var cs linux.RSeqCriticalSection
- if err := cs.CopyIn(t, critAddr); err != nil {
+ if _, err := cs.CopyIn(t, critAddr); err != nil {
t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 1000f3287..c00fa1138 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -554,6 +554,7 @@ func (s *sem) wakeWaiters() {
for w := s.waiters.Front(); w != nil; {
if s.value < w.value {
// Still blocked, skip it.
+ w = w.Next()
continue
}
w.ch <- struct{}{}
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 047b5214d..0e19286de 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -246,7 +246,7 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error {
var lastErr error
for tg := range tasks.Root.tgids {
- if tg.ProcessGroup() == pg {
+ if tg.processGroup == pg {
tg.signalHandlers.mu.Lock()
infoCopy := *info
if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 208569057..f66cfcc7f 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -461,7 +461,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A
func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) {
s.mu.Lock()
defer s.mu.Unlock()
- // TODO(b/38173783): RemoveMapping may be called during task exit, when ctx
+ // RemoveMapping may be called during task exit, when ctx
// is context.Background. Gracefully handle missing clocks. Failing to
// update the detach time in these cases is ok, since no one can observe the
// omission.
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 93c4fe969..84156d5a1 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -209,65 +209,61 @@ type Stracer interface {
// SyscallEnter is called on syscall entry.
//
// The returned private data is passed to SyscallExit.
- //
- // TODO(gvisor.dev/issue/155): remove kernel imports from the strace
- // package so that the type can be used directly.
SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{}
// SyscallExit is called on syscall exit.
SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error)
}
-// SyscallTable is a lookup table of system calls. Critically, a SyscallTable
-// is *immutable*. In order to make supporting suspend and resume sane, they
-// must be uniquely registered and may not change during operation.
+// SyscallTable is a lookup table of system calls.
//
-// +stateify savable
+// Note that a SyscallTable is not savable directly. Instead, they are saved as
+// an OS/Arch pair and lookup happens again on restore.
type SyscallTable struct {
// OS is the operating system that this syscall table implements.
- OS abi.OS `state:"wait"`
+ OS abi.OS
// Arch is the architecture that this syscall table targets.
- Arch arch.Arch `state:"wait"`
+ Arch arch.Arch
// The OS version that this syscall table implements.
- Version Version `state:"manual"`
+ Version Version
// AuditNumber is a numeric constant that represents the syscall table. If
// non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by
// linux/audit.h.
- AuditNumber uint32 `state:"manual"`
+ AuditNumber uint32
// Table is the collection of functions.
- Table map[uintptr]Syscall `state:"manual"`
+ Table map[uintptr]Syscall
// lookup is a fixed-size array that holds the syscalls (indexed by
// their numbers). It is used for fast look ups.
- lookup []SyscallFn `state:"manual"`
+ lookup []SyscallFn
// Emulate is a collection of instruction addresses to emulate. The
// keys are addresses, and the values are system call numbers.
- Emulate map[usermem.Addr]uintptr `state:"manual"`
+ Emulate map[usermem.Addr]uintptr
// The function to call in case of a missing system call.
- Missing MissingFn `state:"manual"`
+ Missing MissingFn
// Stracer traces this syscall table.
- Stracer Stracer `state:"manual"`
+ Stracer Stracer
// External is used to handle an external callback.
- External func(*Kernel) `state:"manual"`
+ External func(*Kernel)
// ExternalFilterBefore is called before External is called before the syscall is executed.
// External is not called if it returns false.
- ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+ ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool
// ExternalFilterAfter is called before External is called after the syscall is executed.
// External is not called if it returns false.
- ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+ ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool
// FeatureEnable stores the strace and one-shot enable bits.
- FeatureEnable SyscallFlagsTable `state:"manual"`
+ FeatureEnable SyscallFlagsTable
}
// allSyscallTables contains all known tables.
@@ -330,6 +326,13 @@ func RegisterSyscallTable(s *SyscallTable) {
allSyscallTables = append(allSyscallTables, s)
}
+// FlushSyscallTablesTestOnly flushes the syscall tables for tests. Used for
+// parameterized VFSv2 tests.
+// TODO(gvisor.dv/issue/1624): Remove when VFS1 is no longer supported.
+func FlushSyscallTablesTestOnly() {
+ allSyscallTables = nil
+}
+
// Lookup returns the syscall implementation, if one exists.
func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
if sysno < uintptr(len(s.lookup)) {
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
index 00358326b..90f890495 100644
--- a/pkg/sentry/kernel/syscalls_state.go
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -14,16 +14,34 @@
package kernel
-import "fmt"
+import (
+ "fmt"
-// afterLoad is invoked by stateify.
-func (s *SyscallTable) afterLoad() {
- otherTable, ok := LookupSyscallTable(s.OS, s.Arch)
- if !ok {
- // Couldn't find a reference?
- panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch))
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+// syscallTableInfo is used to reload the SyscallTable.
+//
+// +stateify savable
+type syscallTableInfo struct {
+ OS abi.OS
+ Arch arch.Arch
+}
+
+// saveSt saves the SyscallTable.
+func (tc *TaskContext) saveSt() syscallTableInfo {
+ return syscallTableInfo{
+ OS: tc.st.OS,
+ Arch: tc.st.Arch,
}
+}
- // Copy the table.
- *s = *otherTable
+// loadSt loads the SyscallTable.
+func (tc *TaskContext) loadSt(sti syscallTableInfo) {
+ st, ok := LookupSyscallTable(sti.OS, sti.Arch)
+ if !ok {
+ panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch))
+ }
+ tc.st = st // Save the table reference.
}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index c0dbbe890..f48247c94 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -484,7 +484,7 @@ type Task struct {
// bit.
//
// numaPolicy and numaNodeMask are protected by mu.
- numaPolicy int32
+ numaPolicy linux.NumaPolicy
numaNodeMask uint64
// netns is the task's network namespace. netns is never nil.
@@ -555,13 +555,6 @@ type Task struct {
//
// startTime is protected by mu.
startTime ktime.Time
-
- // oomScoreAdj is the task's OOM score adjustment. This is currently not
- // used but is maintained for consistency.
- // TODO(gvisor.dev/issue/1967)
- //
- // oomScoreAdj is protected by mu, and is owned by the task goroutine.
- oomScoreAdj int32
}
func (t *Task) savePtraceTracer() *Task {
@@ -784,6 +777,15 @@ func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error
return t.fdTable.NewFDs(t, fd, files, flags)
}
+// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
+ return t.fdTable.NewFDsVFS2(t, fd, files, flags)
+}
+
// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
//
// This automatically passes the task as the context.
@@ -856,27 +858,29 @@ func (t *Task) ContainerID() string {
return t.containerID
}
-// OOMScoreAdj gets the task's OOM score adjustment.
-func (t *Task) OOMScoreAdj() (int32, error) {
- t.mu.Lock()
- defer t.mu.Unlock()
- if t.ExitState() == TaskExitDead {
- return 0, syserror.ESRCH
- }
- return t.oomScoreAdj, nil
+// OOMScoreAdj gets the task's thread group's OOM score adjustment.
+func (t *Task) OOMScoreAdj() int32 {
+ return atomic.LoadInt32(&t.tg.oomScoreAdj)
}
-// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be
-// between -1000 and 1000 inclusive.
+// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The
+// value should be between -1000 and 1000 inclusive.
func (t *Task) SetOOMScoreAdj(adj int32) error {
- t.mu.Lock()
- defer t.mu.Unlock()
- if t.ExitState() == TaskExitDead {
- return syserror.ESRCH
- }
if adj > 1000 || adj < -1000 {
return syserror.EINVAL
}
- t.oomScoreAdj = adj
+ atomic.StoreInt32(&t.tg.oomScoreAdj, adj)
return nil
}
+
+// UID returns t's uid.
+// TODO(gvisor.dev/issue/170): This method is not namespaced yet.
+func (t *Task) UID() uint32 {
+ return uint32(t.Credentials().EffectiveKUID)
+}
+
+// GID returns t's gid.
+// TODO(gvisor.dev/issue/170): This method is not namespaced yet.
+func (t *Task) GID() uint32 {
+ return uint32(t.Credentials().EffectiveKGID)
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index dda502bb8..e1ecca99e 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -15,6 +15,8 @@
package kernel
import (
+ "sync/atomic"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/sentry/inet"
@@ -260,15 +262,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
sh = sh.Fork()
}
tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
+ tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj)
rseqAddr = t.rseqAddr
rseqSignature = t.rseqSignature
}
- adj, err := t.OOMScoreAdj()
- if err != nil {
- return 0, nil, err
- }
-
cfg := &TaskConfig{
Kernel: t.k,
ThreadGroup: tg,
@@ -287,7 +285,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
RSeqAddr: rseqAddr,
RSeqSignature: rseqSignature,
ContainerID: t.ContainerID(),
- OOMScoreAdj: adj,
}
if opts.NewThreadGroup {
cfg.Parent = t
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 0158b1788..9fa528384 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -49,7 +49,7 @@ type TaskContext struct {
fu *futex.Manager
// st is the task's syscall table.
- st *SyscallTable
+ st *SyscallTable `state:".(syscallTableInfo)"`
}
// release releases all resources held by the TaskContext. release is called by
@@ -58,7 +58,6 @@ func (tc *TaskContext) release() {
// Nil out pointers so that if the task is saved after release, it doesn't
// follow the pointers to possibly now-invalid objects.
if tc.MemoryManager != nil {
- // TODO(b/38173783)
tc.MemoryManager.DecUsers(context.Background())
tc.MemoryManager = nil
}
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index ce3e6ef28..0325967e4 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -455,7 +455,7 @@ func (t *Task) SetKeepCaps(k bool) {
t.creds.Store(creds)
}
-// updateCredsForExec updates t.creds to reflect an execve().
+// updateCredsForExecLocked updates t.creds to reflect an execve().
//
// NOTE(b/30815691): We currently do not implement privileged executables
// (set-user/group-ID bits and file capabilities). This allows us to make a lot
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 799cbcd93..d654dd997 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -96,6 +96,7 @@ func (t *Task) run(threadID uintptr) {
t.tg.liveGoroutines.Done()
t.tg.pidns.owner.liveGoroutines.Done()
t.tg.pidns.owner.runningGoroutines.Done()
+ t.p.Release()
// Keep argument alive because stack trace for dead variables may not be correct.
runtime.KeepAlive(threadID)
@@ -353,7 +354,7 @@ func (app *runApp) execute(t *Task) taskRunState {
default:
// What happened? Can't continue.
t.Warningf("Unexpected SwitchToApp error: %v", err)
- t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)})
+ t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)})
return (*runExit)(nil)
}
}
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 8b148db35..09366b60c 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -653,14 +653,14 @@ func (t *Task) SetNiceness(n int) {
}
// NumaPolicy returns t's current numa policy.
-func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) {
+func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) {
t.mu.Lock()
defer t.mu.Unlock()
return t.numaPolicy, t.numaNodeMask
}
// SetNumaPolicy sets t's numa policy.
-func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) {
+func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) {
t.mu.Lock()
defer t.mu.Unlock()
t.numaPolicy = policy
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 8802db142..7d25e98f7 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -174,7 +174,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
fallthrough
case (sre == ERESTARTSYS && !act.IsRestart()):
t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
- t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1)))
+ t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1)))
default:
t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
t.Arch().RestartSyscall()
@@ -263,6 +263,19 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
if t.haveSavedSignalMask {
mask = t.savedSignalMask
}
+
+ // Set up the restorer.
+ // x86-64 should always uses SA_RESTORER, but this flag is optional on other platforms.
+ // Please see the linux code as reference:
+ // linux/arch/x86/kernel/signal.c:__setup_rt_frame()
+ // If SA_RESTORER is not configured, we can use the sigreturn trampolines
+ // the vdso provides instead.
+ // Please see the linux code as reference:
+ // linux/arch/arm64/kernel/signal.c:setup_return()
+ if act.Flags&linux.SA_RESTORER == 0 {
+ act.Restorer = t.MemoryManager().VDSOSigReturn()
+ }
+
if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
return err
}
@@ -513,8 +526,6 @@ func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
if t.stop != nil {
return false
}
- // - TODO(b/38173783): No special case for when t is also the sending task,
- // because the identity of the sender is unknown.
// - Do not choose tasks that have already been interrupted, as they may be
// busy handling another signal.
if len(t.interruptChan) != 0 {
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 2bbf48bb8..a5035bb7f 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -93,9 +93,6 @@ type TaskConfig struct {
// ContainerID is the container the new task belongs to.
ContainerID string
-
- // oomScoreAdj is the task's OOM score adjustment.
- OOMScoreAdj int32
}
// NewTask creates a new task defined by cfg.
@@ -146,7 +143,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
rseqSignature: cfg.RSeqSignature,
futexWaiter: futex.NewWaiter(),
containerID: cfg.ContainerID,
- oomScoreAdj: cfg.OOMScoreAdj,
}
t.creds.Store(cfg.Credentials)
t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index d555d69a8..c9db78e06 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -194,6 +194,19 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
//
// The syscall path is very hot; avoid defer.
func (t *Task) doSyscall() taskRunState {
+ // Save value of the register which is clobbered in the following
+ // t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
+ //
+ // On x86, register rax was shared by syscall number and return
+ // value, and at the entry of the syscall handler, the rax was
+ // saved to regs.orig_rax which was exposed to user space.
+ // But on arm64, syscall number was passed through X8, and the X0
+ // was shared by the first syscall argument and return value. The
+ // X0 was saved to regs.orig_x0 which was not exposed to user space.
+ // So we have to do the same operation here to save the X0 value
+ // into the task context.
+ t.Arch().SyscallSaveOrig()
+
sysno := t.Arch().SyscallNo()
args := t.Arch().SyscallArgs()
@@ -269,6 +282,7 @@ func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
return (*runSyscallExit)(nil)
}
args := t.Arch().SyscallArgs()
+
return t.doSyscallInvoke(sysno, args)
}
@@ -298,7 +312,7 @@ func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRu
return ctrl.next
}
} else if err != nil {
- t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+ t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
t.haveSyscallReturn = true
} else {
t.Arch().SetReturn(rval)
@@ -417,7 +431,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle
// A return is not emulated in this case.
return (*runApp)(nil)
}
- t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+ t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
}
t.Arch().SetIP(t.Arch().Value(caller))
t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
@@ -427,7 +441,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle
// ExtractErrno extracts an integer error number from the error.
// The syscall number is purely for context in the error case. Use -1 if
// syscall number is unknown.
-func (t *Task) ExtractErrno(err error, sysno int) int {
+func ExtractErrno(err error, sysno int) int {
switch err := err.(type) {
case nil:
return 0
@@ -441,11 +455,11 @@ func (t *Task) ExtractErrno(err error, sysno int) int {
// handled (and the SIGBUS is delivered).
return int(syscall.EFAULT)
case *os.PathError:
- return t.ExtractErrno(err.Err, sysno)
+ return ExtractErrno(err.Err, sysno)
case *os.LinkError:
- return t.ExtractErrno(err.Err, sysno)
+ return ExtractErrno(err.Err, sysno)
case *os.SyscallError:
- return t.ExtractErrno(err.Err, sysno)
+ return ExtractErrno(err.Err, sysno)
default:
if errno, ok := syserror.TranslateError(err); ok {
return int(errno)
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 268f62e9d..52849f5b3 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -254,6 +254,13 @@ type ThreadGroup struct {
//
// tty is protected by the signal mutex.
tty *TTY
+
+ // oomScoreAdj is the thread group's OOM score adjustment. This is
+ // currently not used but is maintained for consistency.
+ // TODO(gvisor.dev/issue/1967)
+ //
+ // oomScoreAdj is accessed using atomic memory operations.
+ oomScoreAdj int32
}
// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 706de83ef..e959700f2 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -245,7 +245,7 @@ type Clock interface {
type WallRateClock struct{}
// WallTimeUntil implements Clock.WallTimeUntil.
-func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
+func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration {
return t.Sub(now)
}
@@ -254,16 +254,16 @@ func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
type NoClockEvents struct{}
// Readiness implements waiter.Waitable.Readiness.
-func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
return 0
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (NoClockEvents) EventUnregister(e *waiter.Entry) {
+func (*NoClockEvents) EventUnregister(e *waiter.Entry) {
}
// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and
@@ -273,7 +273,7 @@ type ClockEventsQueue struct {
}
// Readiness implements waiter.Waitable.Readiness.
-func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
return 0
}