diff options
Diffstat (limited to 'pkg/sentry/kernel')
25 files changed, 458 insertions, 198 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index beba29a09..e47af66d6 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -169,6 +169,9 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/pipefs", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index dedf0fa15..75eedd5a2 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -24,6 +24,7 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/context", "//pkg/refs", "//pkg/sentry/fs", diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 8bffb78fc..3d78cd48f 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -20,6 +20,7 @@ import ( "fmt" "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -30,19 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// Event describes the event mask that was observed and the user data to be -// returned when one of the events occurs. It has this format to match the linux -// format to avoid extra copying/allocation when writing events to userspace. -type Event struct { - // Events is the event mask containing the set of events that have been - // observed on an entry. - Events uint32 - - // Data is an opaque 64-bit value provided by the caller when adding the - // entry, and returned to the caller when the entry reports an event. - Data [2]int32 -} - // EntryFlags is a bitmask that holds an entry's flags. type EntryFlags int @@ -227,9 +215,9 @@ func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { } // ReadEvents returns up to max available events. -func (e *EventPoll) ReadEvents(max int) []Event { +func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { var local pollEntryList - var ret []Event + var ret []linux.EpollEvent e.listsMu.Lock() @@ -251,7 +239,7 @@ func (e *EventPoll) ReadEvents(max int) []Event { } // Add event to the array that will be returned to caller. - ret = append(ret, Event{ + ret = append(ret, linux.EpollEvent{ Events: uint32(ready), Data: entry.userData, }) @@ -296,8 +284,10 @@ func (*readyCallback) Callback(w *waiter.Entry) { e.waitingList.Remove(entry) e.readyList.PushBack(entry) entry.curList = &e.readyList + e.listsMu.Unlock() e.Notify(waiter.EventIn) + return } e.listsMu.Unlock() diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 58001d56c..ed40b5303 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -191,10 +191,12 @@ func (f *FDTable) Size() int { return int(size) } -// forEach iterates over all non-nil files. +// forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { + // retries tracks the number of failed TryIncRef attempts for the same FD. + retries := 0 fd := int32(0) for { file, fileVFS2, flags, ok := f.getAll(fd) @@ -204,17 +206,26 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes switch { case file != nil: if !file.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations)) + } continue // Race caught. } fn(fd, file, nil, flags) file.DecRef() case fileVFS2 != nil: if !fileVFS2.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl())) + } continue // Race caught. } fn(fd, nil, fileVFS2, flags) fileVFS2.DecRef() } + retries = 0 fd++ } } @@ -296,6 +307,61 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags return fds, nil } +// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available +// greater than or equal to the fd parameter. All files will share the set +// flags. Success is guaranteed to be all or none. +func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { + if fd < 0 { + // Don't accept negative FDs. + return nil, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if fd >= end { + return nil, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + if fd < f.next { + fd = f.next + } + + // Install all entries. + for i := fd; i < end && len(fds) < len(files); i++ { + if d, _, _ := f.getVFS2(i); d == nil { + f.setVFS2(i, files[len(fds)], flags) // Set the descriptor. + fds = append(fds, i) // Record the file descriptor. + } + } + + // Failure? Unwind existing FDs. + if len(fds) < len(files) { + for _, i := range fds { + f.setVFS2(i, nil, FDFlags{}) // Zap entry. + } + return nil, syscall.EMFILE + } + + if fd == f.next { + // Update next search start position. + f.next = fds[len(fds)-1] + 1 + } + + return fds, nil +} + // NewFDVFS2 allocates a file descriptor greater than or equal to minfd for // the given file description. If it succeeds, it takes a reference on file. func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { @@ -327,7 +393,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc fd = f.next } for fd < end { - if d, _, _ := f.get(fd); d == nil { + if d, _, _ := f.getVFS2(fd); d == nil { f.setVFS2(fd, file, flags) if fd == f.next { // Update next search start position. @@ -447,7 +513,10 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { } } -// GetFDs returns a list of valid fds. +// GetFDs returns a sorted list of valid fds. +// +// Precondition: The caller must be running on the task goroutine, or Task.mu +// must be locked. func (f *FDTable) GetFDs() []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { @@ -522,7 +591,9 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { case orig2 != nil: orig2.IncRef() } - f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + if orig != nil || orig2 != nil { + f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + } return orig, orig2 } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1d627564f..c91b9dce2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -50,6 +50,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -248,6 +250,22 @@ type Kernel struct { // VFS keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem + // hostMount is the Mount used for file descriptors that were imported + // from the host. + hostMount *vfs.Mount + + // pipeMount is the Mount used for pipes created by the pipe() and pipe2() + // syscalls (as opposed to named pipes created by mknod()). + pipeMount *vfs.Mount + + // socketMount is the Mount used for sockets created by the socket() and + // socketpair() syscalls. There are several cases where a socket dentry will + // not be contained in socketMount: + // 1. Socket files created by mknod() + // 2. Socket fds imported from the host (Kernel.hostMount is used for these) + // 3. Socket files created by binding Unix sockets to a file path + socketMount *vfs.Mount + // If set to true, report address space activation waits as if the task is in // external wait so that the watchdog doesn't report the task stuck. SleepForAddressSpaceActivation bool @@ -348,6 +366,29 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() + + if VFS2Enabled { + if err := k.vfs.Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %v", err) + } + + pipeFilesystem := pipefs.NewFilesystem(&k.vfs) + defer pipeFilesystem.DecRef() + pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create pipefs mount: %v", err) + } + k.pipeMount = pipeMount + + socketFilesystem := sockfs.NewFilesystem(&k.vfs) + defer socketFilesystem.DecRef() + socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create sockfs mount: %v", err) + } + k.socketMount = socketMount + } + return nil } @@ -467,6 +508,11 @@ func (k *Kernel) flushMountSourceRefs() error { // // Precondition: Must be called with the kernel paused. func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return nil + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -484,7 +530,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - // TODO(gvisor.dev/issues/1663): Add save support for VFS2. + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil @@ -533,17 +579,32 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { } func (ts *TaskSet) unregisterEpollWaiters() { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return + } + ts.mu.RLock() defer ts.mu.RUnlock() + + // Tasks that belong to the same process could potentially point to the + // same FDTable. So we retain a map of processed ones to avoid + // processing the same FDTable multiple times. + processed := make(map[*FDTable]struct{}) for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if e, ok := file.FileOperations.(*epoll.EventPoll); ok { - e.UnregisterEpollWaiters() - } - }) + if t.fdTable == nil { + continue + } + if _, ok := processed[t.fdTable]; ok { + continue } + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + }) + processed[t.fdTable] = struct{}{} } } @@ -1005,9 +1066,15 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.PauseTimer() + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + tfd.PauseTimer() + } + } else { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } } }) } @@ -1035,9 +1102,15 @@ func (k *Kernel) resumeTimeLocked() { } } if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.ResumeTimer() + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + tfd.ResumeTimer() + } + } else { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } } }) } @@ -1400,9 +1473,10 @@ func (k *Kernel) SupervisorContext() context.Context { // +stateify savable type SocketEntry struct { socketEntry - k *Kernel - Sock *refs.WeakRef - ID uint64 // Socket table entry number. + k *Kernel + Sock *refs.WeakRef + SockVFS2 *vfs.FileDescription + ID uint64 // Socket table entry number. } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. @@ -1425,7 +1499,30 @@ func (k *Kernel) RecordSocket(sock *fs.File) { k.extMu.Unlock() } +// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for +// tracking. +// +// Precondition: Caller must hold a reference to sock. +// +// Note that the socket table will not hold a reference on the +// vfs.FileDescription, because we do not support weak refs on VFS2 files. +func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { + k.extMu.Lock() + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{ + k: k, + ID: id, + SockVFS2: sock, + } + k.sockets.PushBack(s) + k.extMu.Unlock() +} + // ListSockets returns a snapshot of all sockets. +// +// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef() +// to get a reference on a socket in the table. func (k *Kernel) ListSockets() []*SocketEntry { k.extMu.Lock() var socks []*SocketEntry @@ -1533,3 +1630,26 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { func (k *Kernel) VFS() *vfs.VirtualFilesystem { return &k.vfs } + +// SetHostMount sets the hostfs mount. +func (k *Kernel) SetHostMount(mnt *vfs.Mount) { + if k.hostMount != nil { + panic("Kernel.hostMount cannot be set more than once") + } + k.hostMount = mnt +} + +// HostMount returns the hostfs mount. +func (k *Kernel) HostMount() *vfs.Mount { + return k.hostMount +} + +// PipeMount returns the pipefs mount. +func (k *Kernel) PipeMount() *vfs.Mount { + return k.pipeMount +} + +// SocketMount returns the sockfs mount. +func (k *Kernel) SocketMount() *vfs.Mount { + return k.socketMount +} diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 725e9db7d..62c8691f1 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -255,7 +255,8 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be // atomic, but requires no atomicity for writes larger than this. wanted := ops.left() - if avail := p.max - p.view.Size(); wanted > avail { + avail := p.max - p.view.Size() + if wanted > avail { if wanted <= p.atomicIOBytes { return 0, syserror.ErrWouldBlock } @@ -268,8 +269,14 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { return done, err } - if wanted > done { - // Partial write due to full pipe. + if done < avail { + // Non-failure, but short write. + return done, nil + } + if done < wanted { + // Partial write due to full pipe. Note that this could also be + // the short write case above, we would expect a second call + // and the write to return zero bytes in this case. return done, syserror.ErrWouldBlock } diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index a5675bd70..b54f08a30 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -49,38 +49,42 @@ type VFSPipe struct { } // NewVFSPipe returns an initialized VFSPipe. -func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe { +func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { var vp VFSPipe - initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes) + initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes) return &vp } -// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics -// during open: +// ReaderWriterPair returns read-only and write-only FDs for vp. // -// "Normally, opening the FIFO blocks until the other end is opened also. A -// process can open a FIFO in nonblocking mode. In this case, opening for -// read-only will succeed even if no-one has opened on the write side yet, -// opening for write-only will fail with ENXIO (no such device or address) -// unless the other end has already been opened. Under Linux, opening a FIFO -// for read and write will succeed both in blocking and nonblocking mode. POSIX -// leaves this behavior undefined. This can be used to open a FIFO for writing -// while there are no readers available." - fifo(7) -func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { +// Preconditions: statusFlags should not contain an open access mode. +func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags) +} + +// Open opens the pipe represented by vp. +func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) { vp.mu.Lock() defer vp.mu.Unlock() - readable := vfs.MayReadFileWithOpenFlags(flags) - writable := vfs.MayWriteFileWithOpenFlags(flags) + readable := vfs.MayReadFileWithOpenFlags(statusFlags) + writable := vfs.MayWriteFileWithOpenFlags(statusFlags) if !readable && !writable { return nil, syserror.EINVAL } - vfd, err := vp.open(vfsd, vfsfd, flags) - if err != nil { - return nil, err - } + fd := vp.newFD(mnt, vfsd, statusFlags) + // Named pipes have special blocking semantics during open: + // + // "Normally, opening the FIFO blocks until the other end is opened also. A + // process can open a FIFO in nonblocking mode. In this case, opening for + // read-only will succeed even if no-one has opened on the write side yet, + // opening for write-only will fail with ENXIO (no such device or address) + // unless the other end has already been opened. Under Linux, opening a + // FIFO for read and write will succeed both in blocking and nonblocking + // mode. POSIX leaves this behavior undefined. This can be used to open a + // FIFO for writing while there are no readers available." - fifo(7) switch { case readable && writable: // Pipes opened for read-write always succeed without blocking. @@ -89,23 +93,26 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf case readable: newHandleLocked(&vp.rWakeup) - // If this pipe is being opened as nonblocking and there's no + // If this pipe is being opened as blocking and there's no // writer, we have to wait for a writer to open the other end. - if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + fd.DecRef() return nil, syserror.EINTR } case writable: newHandleLocked(&vp.wWakeup) - if !vp.pipe.HasReaders() { - // Nonblocking, write-only opens fail with ENXIO when - // the read side isn't open yet. - if flags&linux.O_NONBLOCK != 0 { + if vp.pipe.isNamed && !vp.pipe.HasReaders() { + // Non-blocking, write-only opens fail with ENXIO when the read + // side isn't open yet. + if statusFlags&linux.O_NONBLOCK != 0 { + fd.DecRef() return nil, syserror.ENXIO } // Wait for a reader to open the other end. if !waitFor(&vp.mu, &vp.rWakeup, ctx) { + fd.DecRef() return nil, syserror.EINTR } } @@ -114,96 +121,93 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf panic("invalid pipe flags: must be readable, writable, or both") } - return vfd, nil + return fd, nil } // Preconditions: vp.mu must be held. -func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { - var fd VFSPipeFD - fd.flags = flags - fd.readable = vfs.MayReadFileWithOpenFlags(flags) - fd.writable = vfs.MayWriteFileWithOpenFlags(flags) - fd.vfsfd = vfsfd - fd.pipe = &vp.pipe +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription { + fd := &VFSPipeFD{ + pipe: &vp.pipe, + } + fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }) switch { - case fd.readable && fd.writable: + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): vp.pipe.rOpen() vp.pipe.wOpen() - case fd.readable: + case fd.vfsfd.IsReadable(): vp.pipe.rOpen() - case fd.writable: + case fd.vfsfd.IsWritable(): vp.pipe.wOpen() default: panic("invalid pipe flags: must be readable, writable, or both") } - return &fd, nil + return &fd.vfsfd } -// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. It is -// expected that filesystesm will use this in a struct implementing -// vfs.FileDescriptionImpl. +// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. type VFSPipeFD struct { - pipe *Pipe - flags uint32 - readable bool - writable bool - vfsfd *vfs.FileDescription + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + pipe *Pipe } // Release implements vfs.FileDescriptionImpl.Release. func (fd *VFSPipeFD) Release() { var event waiter.EventMask - if fd.readable { + if fd.vfsfd.IsReadable() { fd.pipe.rClose() - event |= waiter.EventIn + event |= waiter.EventOut } - if fd.writable { + if fd.vfsfd.IsWritable() { fd.pipe.wClose() - event |= waiter.EventOut + event |= waiter.EventIn | waiter.EventHUp } if event == 0 { panic("invalid pipe flags: must be readable, writable, or both") } - if fd.writable { - fd.vfsfd.VirtualDentry().Mount().EndWrite() - } - fd.pipe.Notify(event) } -// OnClose implements vfs.FileDescriptionImpl.OnClose. -func (fd *VFSPipeFD) OnClose(_ context.Context) error { - return nil +// Readiness implements waiter.Waitable.Readiness. +func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { + switch { + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): + return fd.pipe.rwReadiness() + case fd.vfsfd.IsReadable(): + return fd.pipe.rReadiness() + case fd.vfsfd.IsWritable(): + return fd.pipe.wReadiness() + default: + panic("pipe FD is neither readable nor writable") + } } -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.pipe.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) { + fd.pipe.EventUnregister(e) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { - if !fd.readable { - return 0, syserror.EINVAL - } - return fd.pipe.Read(ctx, dst) } -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE -} - // Write implements vfs.FileDescriptionImpl.Write. func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { - if !fd.writable { - return 0, syserror.EINVAL - } - return fd.pipe.Write(ctx, src) } @@ -211,3 +215,17 @@ func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.Wr func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { return fd.pipe.Ioctl(ctx, uio, args) } + +// PipeSize implements fcntl(F_GETPIPE_SZ). +func (fd *VFSPipeFD) PipeSize() int64 { + // Inline Pipe.FifoSize() rather than calling it with nil Context and + // fs.File and ignoring the returned error (which is always nil). + fd.pipe.mu.Lock() + defer fd.pipe.mu.Unlock() + return fd.pipe.max +} + +// SetPipeSize implements fcntl(F_SETPIPE_SZ). +func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { + return fd.pipe.SetFifoSize(size) +} diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 35ad97d5d..e23e796ef 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -184,7 +184,6 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { return false } - // TODO: Yama LSM return true } diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index ded95f532..18416643b 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -304,7 +304,7 @@ func (t *Task) rseqAddrInterrupt() { } var cs linux.RSeqCriticalSection - if err := cs.CopyIn(t, critAddr); err != nil { + if _, err := cs.CopyIn(t, critAddr); err != nil { t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 1000f3287..c00fa1138 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -554,6 +554,7 @@ func (s *sem) wakeWaiters() { for w := s.waiters.Front(); w != nil; { if s.value < w.value { // Still blocked, skip it. + w = w.Next() continue } w.ch <- struct{}{} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 047b5214d..0e19286de 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -246,7 +246,7 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { var lastErr error for tg := range tasks.Root.tgids { - if tg.ProcessGroup() == pg { + if tg.processGroup == pg { tg.signalHandlers.mu.Lock() infoCopy := *info if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 208569057..f66cfcc7f 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -461,7 +461,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { s.mu.Lock() defer s.mu.Unlock() - // TODO(b/38173783): RemoveMapping may be called during task exit, when ctx + // RemoveMapping may be called during task exit, when ctx // is context.Background. Gracefully handle missing clocks. Failing to // update the detach time in these cases is ok, since no one can observe the // omission. diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 93c4fe969..84156d5a1 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -209,65 +209,61 @@ type Stracer interface { // SyscallEnter is called on syscall entry. // // The returned private data is passed to SyscallExit. - // - // TODO(gvisor.dev/issue/155): remove kernel imports from the strace - // package so that the type can be used directly. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} // SyscallExit is called on syscall exit. SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) } -// SyscallTable is a lookup table of system calls. Critically, a SyscallTable -// is *immutable*. In order to make supporting suspend and resume sane, they -// must be uniquely registered and may not change during operation. +// SyscallTable is a lookup table of system calls. // -// +stateify savable +// Note that a SyscallTable is not savable directly. Instead, they are saved as +// an OS/Arch pair and lookup happens again on restore. type SyscallTable struct { // OS is the operating system that this syscall table implements. - OS abi.OS `state:"wait"` + OS abi.OS // Arch is the architecture that this syscall table targets. - Arch arch.Arch `state:"wait"` + Arch arch.Arch // The OS version that this syscall table implements. - Version Version `state:"manual"` + Version Version // AuditNumber is a numeric constant that represents the syscall table. If // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by // linux/audit.h. - AuditNumber uint32 `state:"manual"` + AuditNumber uint32 // Table is the collection of functions. - Table map[uintptr]Syscall `state:"manual"` + Table map[uintptr]Syscall // lookup is a fixed-size array that holds the syscalls (indexed by // their numbers). It is used for fast look ups. - lookup []SyscallFn `state:"manual"` + lookup []SyscallFn // Emulate is a collection of instruction addresses to emulate. The // keys are addresses, and the values are system call numbers. - Emulate map[usermem.Addr]uintptr `state:"manual"` + Emulate map[usermem.Addr]uintptr // The function to call in case of a missing system call. - Missing MissingFn `state:"manual"` + Missing MissingFn // Stracer traces this syscall table. - Stracer Stracer `state:"manual"` + Stracer Stracer // External is used to handle an external callback. - External func(*Kernel) `state:"manual"` + External func(*Kernel) // ExternalFilterBefore is called before External is called before the syscall is executed. // External is not called if it returns false. - ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` + ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool // ExternalFilterAfter is called before External is called after the syscall is executed. // External is not called if it returns false. - ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` + ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool // FeatureEnable stores the strace and one-shot enable bits. - FeatureEnable SyscallFlagsTable `state:"manual"` + FeatureEnable SyscallFlagsTable } // allSyscallTables contains all known tables. @@ -330,6 +326,13 @@ func RegisterSyscallTable(s *SyscallTable) { allSyscallTables = append(allSyscallTables, s) } +// FlushSyscallTablesTestOnly flushes the syscall tables for tests. Used for +// parameterized VFSv2 tests. +// TODO(gvisor.dv/issue/1624): Remove when VFS1 is no longer supported. +func FlushSyscallTablesTestOnly() { + allSyscallTables = nil +} + // Lookup returns the syscall implementation, if one exists. func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { if sysno < uintptr(len(s.lookup)) { diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go index 00358326b..90f890495 100644 --- a/pkg/sentry/kernel/syscalls_state.go +++ b/pkg/sentry/kernel/syscalls_state.go @@ -14,16 +14,34 @@ package kernel -import "fmt" +import ( + "fmt" -// afterLoad is invoked by stateify. -func (s *SyscallTable) afterLoad() { - otherTable, ok := LookupSyscallTable(s.OS, s.Arch) - if !ok { - // Couldn't find a reference? - panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch)) + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +// syscallTableInfo is used to reload the SyscallTable. +// +// +stateify savable +type syscallTableInfo struct { + OS abi.OS + Arch arch.Arch +} + +// saveSt saves the SyscallTable. +func (tc *TaskContext) saveSt() syscallTableInfo { + return syscallTableInfo{ + OS: tc.st.OS, + Arch: tc.st.Arch, } +} - // Copy the table. - *s = *otherTable +// loadSt loads the SyscallTable. +func (tc *TaskContext) loadSt(sti syscallTableInfo) { + st, ok := LookupSyscallTable(sti.OS, sti.Arch) + if !ok { + panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch)) + } + tc.st = st // Save the table reference. } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c0dbbe890..f48247c94 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -484,7 +484,7 @@ type Task struct { // bit. // // numaPolicy and numaNodeMask are protected by mu. - numaPolicy int32 + numaPolicy linux.NumaPolicy numaNodeMask uint64 // netns is the task's network namespace. netns is never nil. @@ -555,13 +555,6 @@ type Task struct { // // startTime is protected by mu. startTime ktime.Time - - // oomScoreAdj is the task's OOM score adjustment. This is currently not - // used but is maintained for consistency. - // TODO(gvisor.dev/issue/1967) - // - // oomScoreAdj is protected by mu, and is owned by the task goroutine. - oomScoreAdj int32 } func (t *Task) savePtraceTracer() *Task { @@ -784,6 +777,15 @@ func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error return t.fdTable.NewFDs(t, fd, files, flags) } +// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) { + return t.fdTable.NewFDsVFS2(t, fd, files, flags) +} + // NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file. // // This automatically passes the task as the context. @@ -856,27 +858,29 @@ func (t *Task) ContainerID() string { return t.containerID } -// OOMScoreAdj gets the task's OOM score adjustment. -func (t *Task) OOMScoreAdj() (int32, error) { - t.mu.Lock() - defer t.mu.Unlock() - if t.ExitState() == TaskExitDead { - return 0, syserror.ESRCH - } - return t.oomScoreAdj, nil +// OOMScoreAdj gets the task's thread group's OOM score adjustment. +func (t *Task) OOMScoreAdj() int32 { + return atomic.LoadInt32(&t.tg.oomScoreAdj) } -// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be -// between -1000 and 1000 inclusive. +// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The +// value should be between -1000 and 1000 inclusive. func (t *Task) SetOOMScoreAdj(adj int32) error { - t.mu.Lock() - defer t.mu.Unlock() - if t.ExitState() == TaskExitDead { - return syserror.ESRCH - } if adj > 1000 || adj < -1000 { return syserror.EINVAL } - t.oomScoreAdj = adj + atomic.StoreInt32(&t.tg.oomScoreAdj, adj) return nil } + +// UID returns t's uid. +// TODO(gvisor.dev/issue/170): This method is not namespaced yet. +func (t *Task) UID() uint32 { + return uint32(t.Credentials().EffectiveKUID) +} + +// GID returns t's gid. +// TODO(gvisor.dev/issue/170): This method is not namespaced yet. +func (t *Task) GID() uint32 { + return uint32(t.Credentials().EffectiveKGID) +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index dda502bb8..e1ecca99e 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -15,6 +15,8 @@ package kernel import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -260,15 +262,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { sh = sh.Fork() } tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature } - adj, err := t.OOMScoreAdj() - if err != nil { - return 0, nil, err - } - cfg := &TaskConfig{ Kernel: t.k, ThreadGroup: tg, @@ -287,7 +285,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), - OOMScoreAdj: adj, } if opts.NewThreadGroup { cfg.Parent = t diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 0158b1788..9fa528384 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -49,7 +49,7 @@ type TaskContext struct { fu *futex.Manager // st is the task's syscall table. - st *SyscallTable + st *SyscallTable `state:".(syscallTableInfo)"` } // release releases all resources held by the TaskContext. release is called by @@ -58,7 +58,6 @@ func (tc *TaskContext) release() { // Nil out pointers so that if the task is saved after release, it doesn't // follow the pointers to possibly now-invalid objects. if tc.MemoryManager != nil { - // TODO(b/38173783) tc.MemoryManager.DecUsers(context.Background()) tc.MemoryManager = nil } diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index ce3e6ef28..0325967e4 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -455,7 +455,7 @@ func (t *Task) SetKeepCaps(k bool) { t.creds.Store(creds) } -// updateCredsForExec updates t.creds to reflect an execve(). +// updateCredsForExecLocked updates t.creds to reflect an execve(). // // NOTE(b/30815691): We currently do not implement privileged executables // (set-user/group-ID bits and file capabilities). This allows us to make a lot diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 799cbcd93..d654dd997 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -96,6 +96,7 @@ func (t *Task) run(threadID uintptr) { t.tg.liveGoroutines.Done() t.tg.pidns.owner.liveGoroutines.Done() t.tg.pidns.owner.runningGoroutines.Done() + t.p.Release() // Keep argument alive because stack trace for dead variables may not be correct. runtime.KeepAlive(threadID) @@ -353,7 +354,7 @@ func (app *runApp) execute(t *Task) taskRunState { default: // What happened? Can't continue. t.Warningf("Unexpected SwitchToApp error: %v", err) - t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)}) + t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)}) return (*runExit)(nil) } } diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 8b148db35..09366b60c 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -653,14 +653,14 @@ func (t *Task) SetNiceness(n int) { } // NumaPolicy returns t's current numa policy. -func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) { +func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() return t.numaPolicy, t.numaNodeMask } // SetNumaPolicy sets t's numa policy. -func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) { +func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() t.numaPolicy = policy diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 8802db142..7d25e98f7 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -174,7 +174,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS fallthrough case (sre == ERESTARTSYS && !act.IsRestart()): t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) - t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1))) + t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1))) default: t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) t.Arch().RestartSyscall() @@ -263,6 +263,19 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) if t.haveSavedSignalMask { mask = t.savedSignalMask } + + // Set up the restorer. + // x86-64 should always uses SA_RESTORER, but this flag is optional on other platforms. + // Please see the linux code as reference: + // linux/arch/x86/kernel/signal.c:__setup_rt_frame() + // If SA_RESTORER is not configured, we can use the sigreturn trampolines + // the vdso provides instead. + // Please see the linux code as reference: + // linux/arch/arm64/kernel/signal.c:setup_return() + if act.Flags&linux.SA_RESTORER == 0 { + act.Restorer = t.MemoryManager().VDSOSigReturn() + } + if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { return err } @@ -513,8 +526,6 @@ func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { if t.stop != nil { return false } - // - TODO(b/38173783): No special case for when t is also the sending task, - // because the identity of the sender is unknown. // - Do not choose tasks that have already been interrupted, as they may be // busy handling another signal. if len(t.interruptChan) != 0 { diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 2bbf48bb8..a5035bb7f 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -93,9 +93,6 @@ type TaskConfig struct { // ContainerID is the container the new task belongs to. ContainerID string - - // oomScoreAdj is the task's OOM score adjustment. - OOMScoreAdj int32 } // NewTask creates a new task defined by cfg. @@ -146,7 +143,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, - oomScoreAdj: cfg.OOMScoreAdj, } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index d555d69a8..c9db78e06 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -194,6 +194,19 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u // // The syscall path is very hot; avoid defer. func (t *Task) doSyscall() taskRunState { + // Save value of the register which is clobbered in the following + // t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64. + // + // On x86, register rax was shared by syscall number and return + // value, and at the entry of the syscall handler, the rax was + // saved to regs.orig_rax which was exposed to user space. + // But on arm64, syscall number was passed through X8, and the X0 + // was shared by the first syscall argument and return value. The + // X0 was saved to regs.orig_x0 which was not exposed to user space. + // So we have to do the same operation here to save the X0 value + // into the task context. + t.Arch().SyscallSaveOrig() + sysno := t.Arch().SyscallNo() args := t.Arch().SyscallArgs() @@ -269,6 +282,7 @@ func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { return (*runSyscallExit)(nil) } args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) } @@ -298,7 +312,7 @@ func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRu return ctrl.next } } else if err != nil { - t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) t.haveSyscallReturn = true } else { t.Arch().SetReturn(rval) @@ -417,7 +431,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle // A return is not emulated in this case. return (*runApp)(nil) } - t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) } t.Arch().SetIP(t.Arch().Value(caller)) t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) @@ -427,7 +441,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle // ExtractErrno extracts an integer error number from the error. // The syscall number is purely for context in the error case. Use -1 if // syscall number is unknown. -func (t *Task) ExtractErrno(err error, sysno int) int { +func ExtractErrno(err error, sysno int) int { switch err := err.(type) { case nil: return 0 @@ -441,11 +455,11 @@ func (t *Task) ExtractErrno(err error, sysno int) int { // handled (and the SIGBUS is delivered). return int(syscall.EFAULT) case *os.PathError: - return t.ExtractErrno(err.Err, sysno) + return ExtractErrno(err.Err, sysno) case *os.LinkError: - return t.ExtractErrno(err.Err, sysno) + return ExtractErrno(err.Err, sysno) case *os.SyscallError: - return t.ExtractErrno(err.Err, sysno) + return ExtractErrno(err.Err, sysno) default: if errno, ok := syserror.TranslateError(err); ok { return int(errno) diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 268f62e9d..52849f5b3 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -254,6 +254,13 @@ type ThreadGroup struct { // // tty is protected by the signal mutex. tty *TTY + + // oomScoreAdj is the thread group's OOM score adjustment. This is + // currently not used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is accessed using atomic memory operations. + oomScoreAdj int32 } // NewThreadGroup returns a new, empty thread group in PID namespace pidns. The diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 706de83ef..e959700f2 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -245,7 +245,7 @@ type Clock interface { type WallRateClock struct{} // WallTimeUntil implements Clock.WallTimeUntil. -func (WallRateClock) WallTimeUntil(t, now Time) time.Duration { +func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration { return t.Sub(now) } @@ -254,16 +254,16 @@ func (WallRateClock) WallTimeUntil(t, now Time) time.Duration { type NoClockEvents struct{} // Readiness implements waiter.Waitable.Readiness. -func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { +func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { return 0 } // EventRegister implements waiter.Waitable.EventRegister. -func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { } // EventUnregister implements waiter.Waitable.EventUnregister. -func (NoClockEvents) EventUnregister(e *waiter.Entry) { +func (*NoClockEvents) EventUnregister(e *waiter.Entry) { } // ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and @@ -273,7 +273,7 @@ type ClockEventsQueue struct { } // Readiness implements waiter.Waitable.Readiness. -func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { +func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { return 0 } |