diff options
Diffstat (limited to 'pkg/sentry/kernel')
32 files changed, 756 insertions, 396 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 90dd4a047..0ee60569c 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -184,6 +184,7 @@ go_library( "task_exit.go", "task_futex.go", "task_identity.go", + "task_image.go", "task_list.go", "task_log.go", "task_net.go", @@ -224,6 +225,7 @@ go_library( "//pkg/cpuid", "//pkg/eventchannel", "//pkg/fspath", + "//pkg/goid", "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go index 0ac78c0b8..ec36d1a49 100644 --- a/pkg/sentry/kernel/aio.go +++ b/pkg/sentry/kernel/aio.go @@ -15,10 +15,7 @@ package kernel import ( - "time" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" ) // AIOCallback is an function that does asynchronous I/O on behalf of a task. @@ -26,7 +23,7 @@ type AIOCallback func(context.Context) // QueueAIO queues an AIOCallback which will be run asynchronously. func (t *Task) QueueAIO(cb AIOCallback) { - ctx := taskAsyncContext{t: t} + ctx := t.AsyncContext() wg := &t.TaskSet().aioGoroutines wg.Add(1) go func() { @@ -34,48 +31,3 @@ func (t *Task) QueueAIO(cb AIOCallback) { wg.Done() }() } - -type taskAsyncContext struct { - context.NoopSleeper - t *Task -} - -// Debugf implements log.Logger.Debugf. -func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { - ctx.t.Debugf(format, v...) -} - -// Infof implements log.Logger.Infof. -func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { - ctx.t.Infof(format, v...) -} - -// Warningf implements log.Logger.Warningf. -func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { - ctx.t.Warningf(format, v...) -} - -// IsLogging implements log.Logger.IsLogging. -func (ctx taskAsyncContext) IsLogging(level log.Level) bool { - return ctx.t.IsLogging(level) -} - -// Deadline implements context.Context.Deadline. -func (ctx taskAsyncContext) Deadline() (time.Time, bool) { - return ctx.t.Deadline() -} - -// Done implements context.Context.Done. -func (ctx taskAsyncContext) Done() <-chan struct{} { - return ctx.t.Done() -} - -// Err implements context.Context.Err. -func (ctx taskAsyncContext) Err() error { - return ctx.t.Err() -} - -// Value implements context.Context.Value. -func (ctx taskAsyncContext) Value(key interface{}) interface{} { - return ctx.t.Value(key) -} diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go index 4c32ee703..994486ea8 100644 --- a/pkg/sentry/kernel/auth/id.go +++ b/pkg/sentry/kernel/auth/id.go @@ -62,18 +62,28 @@ const ( // field is displayed as 4294967295 (-1 as an unsigned integer);" - // user_namespaces(7) OverflowUID = UID(65534) + + // OverflowGID is the group equivalent to OverflowUID. OverflowGID = GID(65534) // NobodyKUID is the user ID usually reserved for the least privileged user // "nobody". NobodyKUID = KUID(65534) + + // NobodyKGID is the group equivalent to NobodyKUID. NobodyKGID = KGID(65534) // RootKUID is the user ID usually used for the most privileged user "root". RootKUID = KUID(0) + + // RootKGID is the group equivalent to RootKUID. RootKGID = KGID(0) - RootUID = UID(0) - RootGID = GID(0) + + // RootUID is the root user. + RootUID = UID(0) + + // RootGID is the root group. + RootGID = GID(0) ) // Ok returns true if uid is not -1. diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index bb94769c4..a8596410f 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -15,8 +15,6 @@ package kernel import ( - "time" - "gvisor.dev/gvisor/pkg/context" ) @@ -98,18 +96,3 @@ func TaskFromContext(ctx context.Context) *Task { } return nil } - -// Deadline implements context.Context.Deadline. -func (*Task) Deadline() (time.Time, bool) { - return time.Time{}, false -} - -// Done implements context.Context.Done. -func (*Task) Done() <-chan struct{} { - return nil -} - -// Err implements context.Context.Err. -func (*Task) Err() error { - return nil -} diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 15519f0df..61aeca044 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -273,7 +273,7 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { // // Callback is called when one of the files we're polling becomes ready. It // moves said file to the readyList if it's currently in the waiting list. -func (p *pollEntry) Callback(*waiter.Entry) { +func (p *pollEntry) Callback(*waiter.Entry, waiter.EventMask) { e := p.epoll e.listsMu.Lock() @@ -306,9 +306,8 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) { f.EventRegister(&entry.waiter, entry.mask) // Check if the file happens to already be in a ready state. - ready := f.Readiness(entry.mask) & entry.mask - if ready != 0 { - entry.Callback(&entry.waiter) + if ready := f.Readiness(entry.mask) & entry.mask; ready != 0 { + entry.Callback(&entry.waiter, ready) } } diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 2b3955598..f855f038b 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -8,11 +8,13 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", + "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 153d2cd9b..b66d61c6f 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -17,22 +17,45 @@ package fasync import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) -// New creates a new fs.FileAsync. -func New() fs.FileAsync { - return &FileAsync{} +// Table to convert waiter event masks into si_band siginfo codes. +// Taken from fs/fcntl.c:band_table. +var bandTable = map[waiter.EventMask]int64{ + // POLL_IN + waiter.EventIn: linux.EPOLLIN | linux.EPOLLRDNORM, + // POLL_OUT + waiter.EventOut: linux.EPOLLOUT | linux.EPOLLWRNORM | linux.EPOLLWRBAND, + // POLL_ERR + waiter.EventErr: linux.EPOLLERR, + // POLL_PRI + waiter.EventPri: linux.EPOLLPRI | linux.EPOLLRDBAND, + // POLL_HUP + waiter.EventHUp: linux.EPOLLHUP | linux.EPOLLERR, } -// NewVFS2 creates a new vfs.FileAsync. -func NewVFS2() vfs.FileAsync { - return &FileAsync{} +// New returns a function that creates a new fs.FileAsync with the given file +// descriptor. +func New(fd int) func() fs.FileAsync { + return func() fs.FileAsync { + return &FileAsync{fd: fd} + } +} + +// NewVFS2 returns a function that creates a new vfs.FileAsync with the given +// file descriptor. +func NewVFS2(fd int) func() vfs.FileAsync { + return func() vfs.FileAsync { + return &FileAsync{fd: fd} + } } // FileAsync sends signals when the registered file is ready for IO. @@ -42,6 +65,12 @@ type FileAsync struct { // e is immutable after first use (which is protected by mu below). e waiter.Entry + // fd is the file descriptor to notify about. + // It is immutable, set at allocation time. This matches Linux semantics in + // fs/fcntl.c:fasync_helper. + // The fd value is passed to the signal recipient in siginfo.si_fd. + fd int + // regMu protects registeration and unregistration actions on e. // // regMu must be held while registration decisions are being made @@ -56,6 +85,10 @@ type FileAsync struct { mu sync.Mutex `state:"nosave"` requester *auth.Credentials registered bool + // signal is the signal to deliver upon I/O being available. + // The default value ("zero signal") means the default SIGIO signal will be + // delivered. + signal linux.Signal // Only one of the following is allowed to be non-nil. recipientPG *kernel.ProcessGroup @@ -64,10 +97,10 @@ type FileAsync struct { } // Callback sends a signal. -func (a *FileAsync) Callback(e *waiter.Entry) { +func (a *FileAsync) Callback(e *waiter.Entry, mask waiter.EventMask) { a.mu.Lock() + defer a.mu.Unlock() if !a.registered { - a.mu.Unlock() return } t := a.recipientT @@ -80,19 +113,34 @@ func (a *FileAsync) Callback(e *waiter.Entry) { } if t == nil { // No recipient has been registered. - a.mu.Unlock() return } c := t.Credentials() // Logic from sigio_perm in fs/fcntl.c. - if a.requester.EffectiveKUID == 0 || + permCheck := (a.requester.EffectiveKUID == 0 || a.requester.EffectiveKUID == c.SavedKUID || a.requester.EffectiveKUID == c.RealKUID || a.requester.RealKUID == c.SavedKUID || - a.requester.RealKUID == c.RealKUID { - t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO)) + a.requester.RealKUID == c.RealKUID) + if !permCheck { + return } - a.mu.Unlock() + signalInfo := &arch.SignalInfo{ + Signo: int32(linux.SIGIO), + Code: arch.SignalInfoKernel, + } + if a.signal != 0 { + signalInfo.Signo = int32(a.signal) + signalInfo.SetFD(uint32(a.fd)) + var band int64 + for m, bandCode := range bandTable { + if m&mask != 0 { + band |= bandCode + } + } + signalInfo.SetBand(band) + } + t.SendSignal(signalInfo) } // Register sets the file which will be monitored for IO events. @@ -186,3 +234,25 @@ func (a *FileAsync) ClearOwner() { a.recipientTG = nil a.recipientPG = nil } + +// Signal returns which signal will be sent to the signal recipient. +// A value of zero means the signal to deliver wasn't customized, which means +// the default signal (SIGIO) will be delivered. +func (a *FileAsync) Signal() linux.Signal { + a.mu.Lock() + defer a.mu.Unlock() + return a.signal +} + +// SetSignal overrides which signal to send when I/O is available. +// The default behavior can be reset by specifying signal zero, which means +// to send SIGIO. +func (a *FileAsync) SetSignal(signal linux.Signal) error { + if signal != 0 && !signal.IsValid() { + return syserror.EINVAL + } + a.mu.Lock() + defer a.mu.Unlock() + a.signal = signal + return nil +} diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index 470d8bf83..f17f9c59c 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -121,18 +121,21 @@ func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 panic("VFS1 and VFS2 files set") } - slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + slicePtr := (*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) // Grow the table as required. - if last := int32(len(slice)); fd >= last { + if last := int32(len(*slicePtr)); fd >= last { end := fd + 1 if end < 2*last { end = 2 * last } - slice = append(slice, make([]unsafe.Pointer, end-last)...) - atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) + newSlice := append(*slicePtr, make([]unsafe.Pointer, end-last)...) + slicePtr = &newSlice + atomic.StorePointer(&f.slice, unsafe.Pointer(slicePtr)) } + slice := *slicePtr + var desc *descriptor if file != nil || fileVFS2 != nil { desc = &descriptor{ diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 9b2be44d4..303ae8056 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -214,9 +214,11 @@ type Kernel struct { // netlinkPorts manages allocation of netlink socket port IDs. netlinkPorts *port.Manager - // saveErr is the error causing the sandbox to exit during save, if - // any. It is protected by extMu. - saveErr error `state:"nosave"` + // saveStatus is nil if the sandbox has not been saved, errSaved or + // errAutoSaved if it has been saved successfully, or the error causing the + // sandbox to exit during save. + // It is protected by extMu. + saveStatus error `state:"nosave"` // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` @@ -632,7 +634,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { defer k.tasks.mu.RUnlock() for t := range k.tasks.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if mm := t.tc.MemoryManager; mm != nil { + if mm := t.image.MemoryManager; mm != nil { if _, ok := invalidated[mm]; !ok { if err := mm.InvalidateUnsavable(ctx); err != nil { return err @@ -642,7 +644,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { } // I really wish we just had a sync.Map of all MMs... if r, ok := t.runState.(*runSyscallAfterExecStop); ok { - if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { + if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil { return err } } @@ -1017,7 +1019,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, Features: k.featureSet, } - tc, se := k.LoadTaskImage(ctx, loadArgs) + image, se := k.LoadTaskImage(ctx, loadArgs) if se != nil { return nil, 0, errors.New(se.String()) } @@ -1030,7 +1032,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, config := &TaskConfig{ Kernel: k, ThreadGroup: tg, - TaskContext: tc, + TaskImage: image, FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, @@ -1046,7 +1048,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, err } - t.traceExecEvent(tc) // Simulate exec for tracing. + t.traceExecEvent(image) // Simulate exec for tracing. // Success. cu.Release() @@ -1359,6 +1361,13 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { // not have meaningful trace data. Rebuilding here ensures that we can do so // after tracing has been enabled. func (k *Kernel) RebuildTraceContexts() { + // We need to pause all task goroutines because Task.rebuildTraceContext() + // replaces Task.traceContext and Task.traceTask, which are + // task-goroutine-exclusive (i.e. the task goroutine assumes that it can + // access them without synchronization) for performance. + k.Pause() + defer k.Unpause() + k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() @@ -1424,8 +1433,8 @@ func (k *Kernel) GlobalInit() *ThreadGroup { return k.globalInit } -// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace. -func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) { +// TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace. +func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) { k.globalInit = tg } @@ -1474,12 +1483,42 @@ func (k *Kernel) NetlinkPorts() *port.Manager { return k.netlinkPorts } -// SaveError returns the sandbox error that caused the kernel to exit during -// save. -func (k *Kernel) SaveError() error { +var ( + errSaved = errors.New("sandbox has been successfully saved") + errAutoSaved = errors.New("sandbox has been successfully auto-saved") +) + +// SaveStatus returns the sandbox save status. If it was saved successfully, +// autosaved indicates whether save was triggered by autosave. If it was not +// saved successfully, err indicates the sandbox error that caused the kernel to +// exit during save. +func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) { k.extMu.Lock() defer k.extMu.Unlock() - return k.saveErr + switch k.saveStatus { + case nil: + return false, false, nil + case errSaved: + return true, false, nil + case errAutoSaved: + return true, true, nil + default: + return false, false, k.saveStatus + } +} + +// SetSaveSuccess sets the flag indicating that save completed successfully, if +// no status was already set. +func (k *Kernel) SetSaveSuccess(autosave bool) { + k.extMu.Lock() + defer k.extMu.Unlock() + if k.saveStatus == nil { + if autosave { + k.saveStatus = errAutoSaved + } else { + k.saveStatus = errSaved + } + } } // SetSaveError sets the sandbox error that caused the kernel to exit during @@ -1487,8 +1526,8 @@ func (k *Kernel) SaveError() error { func (k *Kernel) SetSaveError(err error) { k.extMu.Lock() defer k.extMu.Unlock() - if k.saveErr == nil { - k.saveErr = err + if k.saveStatus == nil { + k.saveStatus = err } } diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 7b23cbe86..2d47d2e82 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -63,10 +63,19 @@ func NewVFSPipe(isNamed bool, sizeBytes int64) *VFSPipe { // ReaderWriterPair returns read-only and write-only FDs for vp. // // Preconditions: statusFlags should not contain an open access mode. -func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { +func (vp *VFSPipe) ReaderWriterPair(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription, error) { // Connected pipes share the same locks. locks := &vfs.FileLocks{} - return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) + r, err := vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks) + if err != nil { + return nil, nil, err + } + w, err := vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) + if err != nil { + r.DecRef(ctx) + return nil, nil, err + } + return r, w, nil } // Allocate implements vfs.FileDescriptionImpl.Allocate. @@ -85,7 +94,10 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s return nil, syserror.EINVAL } - fd := vp.newFD(mnt, vfsd, statusFlags, locks) + fd, err := vp.newFD(mnt, vfsd, statusFlags, locks) + if err != nil { + return nil, err + } // Named pipes have special blocking semantics during open: // @@ -137,16 +149,18 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s } // Preconditions: vp.mu must be held. -func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription { +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { fd := &VFSPipeFD{ pipe: &vp.pipe, } fd.LockFD.Init(locks) - fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ + if err := fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, UseDentryMetadata: true, - }) + }); err != nil { + return nil, err + } switch { case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): @@ -160,7 +174,7 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, l panic("invalid pipe flags: must be readable, writable, or both") } - return &fd.vfsfd + return &fd.vfsfd, nil } // VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 1abfe2201..cef58a590 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -259,8 +259,8 @@ func (t *Task) ptraceTrapLocked(code int32) { Signo: int32(linux.SIGTRAP), Code: code, } - t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) - t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) if t.beginPtraceStopLocked() { tracer := t.Tracer() tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 387edfa91..60917e7d3 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -106,7 +106,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { data := linux.SeccompData{ Nr: sysno, - Arch: t.tc.st.AuditNumber, + Arch: t.image.st.AuditNumber, InstructionPointer: uint64(ip), } // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index b99c0bffa..db01e4a97 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -29,17 +29,17 @@ import ( ) const ( - valueMax = 32767 // SEMVMX + // Maximum semaphore value. + valueMax = linux.SEMVMX - // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL). - semaphoresMax = 32000 + // Maximum number of semaphore sets. + setsMax = linux.SEMMNI - // setMax is "system-wide limit on the number of semaphore sets" (SEMMNI). - setsMax = 32000 + // Maximum number of semaphroes in a semaphore set. + semsMax = linux.SEMMSL - // semaphoresTotalMax is "system-wide limit on the number of semaphores" - // (SEMMNS = SEMMNI*SEMMSL). - semaphoresTotalMax = 1024000000 + // Maximum number of semaphores in all semaphroe sets. + semsTotalMax = linux.SEMMNS ) // Registry maintains a set of semaphores that can be found by key or ID. @@ -52,6 +52,9 @@ type Registry struct { mu sync.Mutex `state:"nosave"` semaphores map[int32]*Set lastIDUsed int32 + // indexes maintains a mapping between a set's index in virtual array and + // its identifier. + indexes map[int32]int32 } // Set represents a set of semaphores that can be operated atomically. @@ -113,6 +116,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ userNS: userNS, semaphores: make(map[int32]*Set), + indexes: make(map[int32]int32), } } @@ -122,7 +126,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { // be found. If exclusive is true, it fails if a set with the same key already // exists. func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { - if nsems < 0 || nsems > semaphoresMax { + if nsems < 0 || nsems > semsMax { return nil, syserror.EINVAL } @@ -163,10 +167,13 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu } // Apply system limits. + // + // Map semaphores and map indexes in a registry are of the same size, + // check map semaphores only here for the system limit. if len(r.semaphores) >= setsMax { return nil, syserror.EINVAL } - if r.totalSems() > int(semaphoresTotalMax-nsems) { + if r.totalSems() > int(semsTotalMax-nsems) { return nil, syserror.EINVAL } @@ -176,6 +183,53 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu return r.newSet(ctx, key, owner, owner, perms, nsems) } +// IPCInfo returns information about system-wide semaphore limits and parameters. +func (r *Registry) IPCInfo() *linux.SemInfo { + return &linux.SemInfo{ + SemMap: linux.SEMMAP, + SemMni: linux.SEMMNI, + SemMns: linux.SEMMNS, + SemMnu: linux.SEMMNU, + SemMsl: linux.SEMMSL, + SemOpm: linux.SEMOPM, + SemUme: linux.SEMUME, + SemUsz: linux.SEMUSZ, + SemVmx: linux.SEMVMX, + SemAem: linux.SEMAEM, + } +} + +// SemInfo returns a seminfo structure containing the same information as +// for IPC_INFO, except that SemUsz field returns the number of existing +// semaphore sets, and SemAem field returns the number of existing semaphores. +func (r *Registry) SemInfo() *linux.SemInfo { + r.mu.Lock() + defer r.mu.Unlock() + + info := r.IPCInfo() + info.SemUsz = uint32(len(r.semaphores)) + info.SemAem = uint32(r.totalSems()) + + return info +} + +// HighestIndex returns the index of the highest used entry in +// the kernel's array. +func (r *Registry) HighestIndex() int32 { + r.mu.Lock() + defer r.mu.Unlock() + + // By default, highest used index is 0 even though + // there is no semaphroe set. + var highestIndex int32 + for index := range r.indexes { + if index > highestIndex { + highestIndex = index + } + } + return highestIndex +} + // RemoveID removes set with give 'id' from the registry and marks the set as // dead. All waiters will be awakened and fail. func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { @@ -186,6 +240,11 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { if set == nil { return syserror.EINVAL } + index, found := r.findIndexByID(id) + if !found { + // Inconsistent state. + panic(fmt.Sprintf("unable to find an index for ID: %d", id)) + } set.mu.Lock() defer set.mu.Unlock() @@ -197,6 +256,7 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { } delete(r.semaphores, set.ID) + delete(r.indexes, index) set.destroy() return nil } @@ -220,6 +280,11 @@ func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.File continue } if r.semaphores[id] == nil { + index, found := r.findFirstAvailableIndex() + if !found { + panic("unable to find an available index") + } + r.indexes[index] = id r.lastIDUsed = id r.semaphores[id] = set set.ID = id @@ -238,6 +303,18 @@ func (r *Registry) FindByID(id int32) *Set { return r.semaphores[id] } +// FindByIndex looks up a set given an index. +func (r *Registry) FindByIndex(index int32) *Set { + r.mu.Lock() + defer r.mu.Unlock() + + id, present := r.indexes[index] + if !present { + return nil + } + return r.semaphores[id] +} + func (r *Registry) findByKey(key int32) *Set { for _, v := range r.semaphores { if v.key == key { @@ -247,6 +324,24 @@ func (r *Registry) findByKey(key int32) *Set { return nil } +func (r *Registry) findIndexByID(id int32) (int32, bool) { + for k, v := range r.indexes { + if v == id { + return k, true + } + } + return 0, false +} + +func (r *Registry) findFirstAvailableIndex() (int32, bool) { + for index := int32(0); index < setsMax; index++ { + if _, present := r.indexes[index]; !present { + return index, true + } + } + return 0, false +} + func (r *Registry) totalSems() int { totalSems := 0 for _, v := range r.semaphores { diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 80a592c8f..073e14507 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -6,6 +6,9 @@ package(licenses = ["notice"]) go_template_instance( name = "shm_refs", out = "shm_refs.go", + consts = { + "enableLogging": "true", + }, package = "shm", prefix = "Shm", template = "//pkg/refsvfs2:refs_template", diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index e8cce37d0..2488ae7d5 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -73,7 +73,7 @@ func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo Signo: int32(sig), Code: arch.SignalInfoUser, } - info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) - info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + info.SetPID(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) + info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) return info } diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 78f718cfe..884966120 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -106,8 +106,8 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, - PID: uint32(info.Pid()), - UID: uint32(info.Uid()), + PID: uint32(info.PID()), + UID: uint32(info.UID()), Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go index 90f890495..0b17a562e 100644 --- a/pkg/sentry/kernel/syscalls_state.go +++ b/pkg/sentry/kernel/syscalls_state.go @@ -30,18 +30,18 @@ type syscallTableInfo struct { } // saveSt saves the SyscallTable. -func (tc *TaskContext) saveSt() syscallTableInfo { +func (image *TaskImage) saveSt() syscallTableInfo { return syscallTableInfo{ - OS: tc.st.OS, - Arch: tc.st.Arch, + OS: image.st.OS, + Arch: image.st.Arch, } } // loadSt loads the SyscallTable. -func (tc *TaskContext) loadSt(sti syscallTableInfo) { +func (image *TaskImage) loadSt(sti syscallTableInfo) { st, ok := LookupSyscallTable(sti.OS, sti.Arch) if !ok { panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch)) } - tc.st = st // Save the table reference. + image.st = st // Save the table reference. } diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index a83ce219c..3fee7aa68 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -75,6 +75,12 @@ func (s *syslog) Log() []byte { "Checking naughty and nice process list...", // Check it up to twice. "Granting licence to kill(2)...", // British spelling for British movie. "Letting the watchdogs out...", + "Conjuring /dev/null black hole...", + "Adversarially training Redcode AI...", + "Singleplexing /dev/ptmx...", + "Recruiting cron-ies...", + "Verifying that no non-zero bytes made their way into /dev/zero...", + "Accelerating teletypewriter to 9600 baud...", } selectMessage := func() string { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 037971393..c0ab53c94 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -29,11 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -63,6 +58,12 @@ import ( type Task struct { taskNode + // goid is the task goroutine's ID. goid is owned by the task goroutine, + // but since it's used to detect cases where non-task goroutines + // incorrectly access state owned by, or exclusive to, the task goroutine, + // goid is always accessed using atomic memory operations. + goid int64 `state:"nosave"` + // runState is what the task goroutine is executing if it is not stopped. // If runState is nil, the task goroutine should exit or has exited. // runState is exclusive to the task goroutine. @@ -83,7 +84,7 @@ type Task struct { // taskWork is exclusive to the task goroutine. taskWork []TaskWorker - // haveSyscallReturn is true if tc.Arch().Return() represents a value + // haveSyscallReturn is true if image.Arch().Return() represents a value // returned by a syscall (or set by ptrace after a syscall). // // haveSyscallReturn is exclusive to the task goroutine. @@ -257,10 +258,10 @@ type Task struct { // mu protects some of the following fields. mu sync.Mutex `state:"nosave"` - // tc holds task data provided by the ELF loader. + // image holds task data provided by the ELF loader. // - // tc is protected by mu, and is owned by the task goroutine. - tc TaskContext + // image is protected by mu, and is owned by the task goroutine. + image TaskImage // fsContext is the task's filesystem context. // @@ -274,7 +275,7 @@ type Task struct { // If vforkParent is not nil, it is the task that created this task with // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when - // this TaskContext is released. + // this TaskImage is released. // // vforkParent is protected by the TaskSet mutex. vforkParent *Task @@ -641,64 +642,6 @@ func (t *Task) Kernel() *Kernel { return t.k } -// Value implements context.Context.Value. -// -// Preconditions: The caller must be running on the task goroutine (as implied -// by the requirements of context.Context). -func (t *Task) Value(key interface{}) interface{} { - switch key { - case CtxCanTrace: - return t.CanTrace - case CtxKernel: - return t.k - case CtxPIDNamespace: - return t.tg.pidns - case CtxUTSNamespace: - return t.utsns - case CtxIPCNamespace: - ipcns := t.IPCNamespace() - ipcns.IncRef() - return ipcns - case CtxTask: - return t - case auth.CtxCredentials: - return t.Credentials() - case context.CtxThreadGroupID: - return int32(t.ThreadGroup().ID()) - case fs.CtxRoot: - return t.fsContext.RootDirectory() - case vfs.CtxRoot: - return t.fsContext.RootDirectoryVFS2() - case vfs.CtxMountNamespace: - t.mountNamespaceVFS2.IncRef() - return t.mountNamespaceVFS2 - case fs.CtxDirentCacheLimiter: - return t.k.DirentCacheLimiter - case inet.CtxStack: - return t.NetworkContext() - case ktime.CtxRealtimeClock: - return t.k.RealtimeClock() - case limits.CtxLimits: - return t.tg.limits - case pgalloc.CtxMemoryFile: - return t.k.mf - case pgalloc.CtxMemoryFileProvider: - return t.k - case platform.CtxPlatform: - return t.k - case uniqueid.CtxGlobalUniqueID: - return t.k.UniqueID() - case uniqueid.CtxGlobalUniqueIDProvider: - return t.k - case uniqueid.CtxInotifyCookie: - return t.k.GenerateInotifyCookie() - case unimpl.CtxEvents: - return t.k - default: - return nil - } -} - // SetClearTID sets t's cleartid. // // Preconditions: The caller must be running on the task goroutine. @@ -751,12 +694,12 @@ func (t *Task) IsChrooted() bool { return root != realRoot } -// TaskContext returns t's TaskContext. +// TaskImage returns t's TaskImage. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. -func (t *Task) TaskContext() *TaskContext { - return &t.tc +func (t *Task) TaskImage() *TaskImage { + return &t.image } // FSContext returns t's FSContext. FSContext does not take an additional diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index 5f3e60fe8..e574997f7 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -136,14 +136,14 @@ func (tg *ThreadGroup) IOUsage() *usage.IO { func (t *Task) Name() string { t.mu.Lock() defer t.mu.Unlock() - return t.tc.Name + return t.image.Name } // SetName changes t's name. func (t *Task) SetName(name string) { t.mu.Lock() defer t.mu.Unlock() - t.tc.Name = name + t.image.Name = name t.Debugf("Set thread name to %q", name) } diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 4a4a69ee2..9419f2e95 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -20,6 +20,7 @@ import ( "time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,6 +33,8 @@ import ( // // - An error which is nil if an event is received from C, ETIMEDOUT if the timeout // expired, and syserror.ErrInterrupted if t is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { if !haveTimeout { return timeout, t.block(C, nil) @@ -112,7 +115,14 @@ func (t *Task) Block(C <-chan struct{}) error { // block blocks a task on one of many events. // N.B. defer is too expensive to be used here. +// +// Preconditions: The caller must be running on the task goroutine. func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { + // This function is very hot; skip this check outside of +race builds. + if sync.RaceEnabled { + t.assertTaskGoroutine() + } + // Fast path if the request is already done. select { case <-C: @@ -156,33 +166,39 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { } } -// SleepStart implements amutex.Sleeper.SleepStart. +// SleepStart implements context.ChannelSleeper.SleepStart. func (t *Task) SleepStart() <-chan struct{} { + t.assertTaskGoroutine() t.Deactivate() t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible) return t.interruptChan } -// SleepFinish implements amutex.Sleeper.SleepFinish. +// SleepFinish implements context.ChannelSleeper.SleepFinish. func (t *Task) SleepFinish(success bool) { if !success { - // The interrupted notification is consumed only at the top-level - // (Run). Therefore we attempt to reset the pending notification. - // This will also elide our next entry back into the task, so we - // will process signals, state changes, etc. + // Our caller received from t.interruptChan; we need to re-send to it + // to ensure that t.interrupted() is still true. t.interruptSelf() } t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible) t.Activate() } -// Interrupted implements amutex.Sleeper.Interrupted +// Interrupted implements context.ChannelSleeper.Interrupted. func (t *Task) Interrupted() bool { - return len(t.interruptChan) != 0 + if t.interrupted() { + return true + } + // Indicate that t's task goroutine is still responsive (i.e. reset the + // watchdog timer). + t.accountTaskGoroutineRunning() + return false } // UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart. func (t *Task) UninterruptibleSleepStart(deactivate bool) { + t.assertTaskGoroutine() if deactivate { t.Deactivate() } @@ -198,13 +214,17 @@ func (t *Task) UninterruptibleSleepFinish(activate bool) { } // interrupted returns true if interrupt or interruptSelf has been called at -// least once since the last call to interrupted. +// least once since the last call to unsetInterrupted. func (t *Task) interrupted() bool { + return len(t.interruptChan) != 0 +} + +// unsetInterrupted causes interrupted to return false until the next call to +// interrupt or interruptSelf. +func (t *Task) unsetInterrupted() { select { case <-t.interruptChan: - return true default: - return false } } @@ -220,9 +240,7 @@ func (t *Task) interrupt() { func (t *Task) interruptSelf() { select { case t.interruptChan <- struct{}{}: - t.Debugf("Interrupt queued") default: - t.Debugf("Dropping duplicate interrupt") } // platform.Context.Interrupt() is unnecessary since a task goroutine // calling interruptSelf() cannot also be blocked in diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 527344162..f305e69c0 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -115,7 +115,7 @@ type CloneOptions struct { ParentTID usermem.Addr // If Vfork is true, place the parent in vforkStop until the cloned task - // releases its TaskContext. + // releases its TaskImage. Vfork bool // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for @@ -226,20 +226,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { }) } - tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) + image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err } cu.Add(func() { - tc.release() + image.release() }) // clone() returns 0 in the child. - tc.Arch.SetReturn(0) + image.Arch.SetReturn(0) if opts.Stack != 0 { - tc.Arch.SetStack(uintptr(opts.Stack)) + image.Arch.SetStack(uintptr(opts.Stack)) } if opts.SetTLS { - if !tc.Arch.SetTLS(uintptr(opts.TLS)) { + if !image.Arch.SetTLS(uintptr(opts.TLS)) { return 0, nil, syserror.EPERM } } @@ -288,7 +288,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { Kernel: t.k, ThreadGroup: tg, SignalMask: t.SignalMask(), - TaskContext: tc, + TaskImage: image, FSContext: fsContext, FDTable: fdTable, Credentials: creds, diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index d1136461a..70b0699dc 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,159 +15,175 @@ package kernel import ( - "fmt" + "time" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel/futex" - "gvisor.dev/gvisor/pkg/sentry/loader" - "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" ) -var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) - -// Auxmap contains miscellaneous data for the task. -type Auxmap map[string]interface{} - -// TaskContext is the subset of a task's data that is provided by the loader. -// -// +stateify savable -type TaskContext struct { - // Name is the thread name set by the prctl(PR_SET_NAME) system call. - Name string - - // Arch is the architecture-specific context (registers, etc.) - Arch arch.Context - - // MemoryManager is the task's address space. - MemoryManager *mm.MemoryManager +// Deadline implements context.Context.Deadline. +func (t *Task) Deadline() (time.Time, bool) { + return time.Time{}, false +} - // fu implements futexes in the address space. - fu *futex.Manager +// Done implements context.Context.Done. +func (t *Task) Done() <-chan struct{} { + return nil +} - // st is the task's syscall table. - st *SyscallTable `state:".(syscallTableInfo)"` +// Err implements context.Context.Err. +func (t *Task) Err() error { + return nil } -// release releases all resources held by the TaskContext. release is called by -// the task when it execs into a new TaskContext or exits. -func (tc *TaskContext) release() { - // Nil out pointers so that if the task is saved after release, it doesn't - // follow the pointers to possibly now-invalid objects. - if tc.MemoryManager != nil { - tc.MemoryManager.DecUsers(context.Background()) - tc.MemoryManager = nil +// Value implements context.Context.Value. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Value(key interface{}) interface{} { + // This function is very hot; skip this check outside of +race builds. + if sync.RaceEnabled { + t.assertTaskGoroutine() } - tc.fu = nil + return t.contextValue(key, true /* isTaskGoroutine */) } -// Fork returns a duplicate of tc. The copied TaskContext always has an -// independent arch.Context. If shareAddressSpace is true, the copied -// TaskContext shares an address space with the original; otherwise, the copied -// TaskContext has an independent address space that is initially a duplicate -// of the original's. -func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) { - newTC := &TaskContext{ - Name: tc.Name, - Arch: tc.Arch.Fork(), - st: tc.st, - } - if shareAddressSpace { - newTC.MemoryManager = tc.MemoryManager - if newTC.MemoryManager != nil { - if !newTC.MemoryManager.IncUsers() { - // Shouldn't be possible since tc.MemoryManager should be a - // counted user. - panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager")) - } +func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} { + switch key { + case CtxCanTrace: + return t.CanTrace + case CtxKernel: + return t.k + case CtxPIDNamespace: + return t.tg.pidns + case CtxUTSNamespace: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() + } + return t.utsns + case CtxIPCNamespace: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() + } + ipcns := t.ipcns + ipcns.IncRef() + return ipcns + case CtxTask: + return t + case auth.CtxCredentials: + return t.creds.Load() + case context.CtxThreadGroupID: + return int32(t.tg.ID()) + case fs.CtxRoot: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() + } + return t.fsContext.RootDirectory() + case vfs.CtxRoot: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() } - newTC.fu = tc.fu - } else { - newMM, err := tc.MemoryManager.Fork(ctx) - if err != nil { - return nil, err + return t.fsContext.RootDirectoryVFS2() + case vfs.CtxMountNamespace: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() } - newTC.MemoryManager = newMM - newTC.fu = k.futexes.Fork() + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 + case fs.CtxDirentCacheLimiter: + return t.k.DirentCacheLimiter + case inet.CtxStack: + return t.NetworkContext() + case ktime.CtxRealtimeClock: + return t.k.RealtimeClock() + case limits.CtxLimits: + return t.tg.limits + case pgalloc.CtxMemoryFile: + return t.k.mf + case pgalloc.CtxMemoryFileProvider: + return t.k + case platform.CtxPlatform: + return t.k + case uniqueid.CtxGlobalUniqueID: + return t.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return t.k + case uniqueid.CtxInotifyCookie: + return t.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return t.k + default: + return nil } - return newTC, nil } -// Arch returns t's arch.Context. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) Arch() arch.Context { - return t.tc.Arch +// taskAsyncContext implements context.Context for a goroutine that performs +// work on behalf of a Task, but is not the task goroutine. +type taskAsyncContext struct { + context.NoopSleeper + + t *Task } -// MemoryManager returns t's MemoryManager. MemoryManager does not take an -// additional reference on the returned MM. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) MemoryManager() *mm.MemoryManager { - return t.tc.MemoryManager +// AsyncContext returns a context.Context representing t. The returned +// context.Context is intended for use by goroutines other than t's task +// goroutine; for example, signal delivery to t will not interrupt goroutines +// that are blocking using the returned context.Context. +func (t *Task) AsyncContext() context.Context { + return taskAsyncContext{t: t} } -// SyscallTable returns t's syscall table. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) SyscallTable() *SyscallTable { - return t.tc.st +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) } -// Stack returns the userspace stack. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) Stack() *arch.Stack { - return &arch.Stack{ - Arch: t.Arch(), - IO: t.MemoryManager(), - Bottom: usermem.Addr(t.Arch().Stack()), - } +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) } -// LoadTaskImage loads a specified file into a new TaskContext. -// -// args.MemoryManager does not need to be set by the caller. -func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { - // If File is not nil, we should load that instead of resolving Filename. - if args.File != nil { - args.Filename = args.File.PathnameWithDeleted(ctx) - } +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} - // Prepare a new user address space to load into. - m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) - defer m.DecUsers(ctx) - args.MemoryManager = m +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return time.Time{}, false +} - os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) - if err != nil { - return nil, err - } +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return nil +} - // Lookup our new syscall table. - st, ok := LookupSyscallTable(os, ac.Arch()) - if !ok { - // No syscall table found. This means that the ELF binary does not match - // the architecture. - return nil, errNoSyscalls - } +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return nil +} - if !m.IncUsers() { - panic("Failed to increment users count on new MM") - } - return &TaskContext{ - Name: name, - Arch: ac, - MemoryManager: m, - fu: k.futexes.Fork(), - st: st, - }, nil +// Value implements context.Context.Value. +func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.contextValue(key, false /* isTaskGoroutine */) } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 412d471d3..d9897e802 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -83,11 +83,12 @@ type execStop struct{} func (*execStop) Killable() bool { return true } // Execve implements the execve(2) syscall by killing all other tasks in its -// thread group and switching to newTC. Execve always takes ownership of newTC. +// thread group and switching to newImage. Execve always takes ownership of +// newImage. // // Preconditions: The caller must be running Task.doSyscallInvoke on the task // goroutine. -func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { +func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.tg.signalHandlers.mu.Lock() @@ -96,7 +97,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { if t.tg.exiting || t.tg.execing != nil { // We lost to a racing group-exit, kill, or exec from another thread // and should just exit. - newTC.release() + newImage.release() return nil, syserror.EINTR } @@ -118,7 +119,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { t.beginInternalStopLocked((*execStop)(nil)) } - return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil + return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil } // The runSyscallAfterExecStop state continues execve(2) after all siblings of @@ -126,16 +127,16 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { // // +stateify savable type runSyscallAfterExecStop struct { - tc *TaskContext + image *TaskImage } func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { - t.traceExecEvent(r.tc) + t.traceExecEvent(r.image) t.tg.pidns.owner.mu.Lock() t.tg.execing = nil if t.killed() { t.tg.pidns.owner.mu.Unlock() - r.tc.release() + r.image.release() return (*runInterrupt)(nil) } // We are the thread group leader now. Save our old thread ID for @@ -214,7 +215,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // executables (set-user/group-ID bits and file capabilities). This // allows us to unconditionally enable user dumpability on the new mm. // See fs/exec.c:setup_new_exec. - r.tc.MemoryManager.SetDumpability(mm.UserDumpable) + r.image.MemoryManager.SetDumpability(mm.UserDumpable) // Switch to the new process. t.MemoryManager().Deactivate() @@ -222,8 +223,8 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // Update credentials to reflect the execve. This should precede switching // MMs to ensure that dumpability has been reset first, if needed. t.updateCredsForExecLocked() - t.tc.release() - t.tc = *r.tc + t.image.release() + t.image = *r.image t.mu.Unlock() t.unstopVforkParent() t.p.FullStateChanged() diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index ce7b9641d..16986244c 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -266,7 +266,7 @@ func (*runExitMain) execute(t *Task) taskRunState { t.updateRSSLocked() t.tg.pidns.owner.mu.Unlock() t.mu.Lock() - t.tc.release() + t.image.release() t.mu.Unlock() // Releasing the MM unblocks a blocked CLONE_VFORK parent. @@ -368,8 +368,8 @@ func (t *Task) exitChildren() { Signo: int32(sig), Code: arch.SignalInfoUser, } - siginfo.SetPid(int32(c.tg.pidns.tids[t])) - siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) + siginfo.SetPID(int32(c.tg.pidns.tids[t])) + siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) c.tg.signalHandlers.mu.Lock() c.sendSignalLocked(siginfo, true /* group */) c.tg.signalHandlers.mu.Unlock() @@ -698,8 +698,8 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.Si info := &arch.SignalInfo{ Signo: int32(sig), } - info.SetPid(int32(receiver.tg.pidns.tids[t])) - info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + info.SetPID(int32(receiver.tg.pidns.tids[t])) + info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) if t.exitStatus.Signaled() { info.Code = arch.CLD_KILLED info.SetStatus(int32(t.exitStatus.Signo)) diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index c80391475..195c7da9b 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -26,7 +26,7 @@ import ( // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Futex() *futex.Manager { - return t.tc.fu + return t.image.fu } // SwapUint32 implements futex.Target.SwapUint32. diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go new file mode 100644 index 000000000..ce5fbd299 --- /dev/null +++ b/pkg/sentry/kernel/task_image.go @@ -0,0 +1,173 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/usermem" +) + +var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) + +// Auxmap contains miscellaneous data for the task. +type Auxmap map[string]interface{} + +// TaskImage is the subset of a task's data that is provided by the loader. +// +// +stateify savable +type TaskImage struct { + // Name is the thread name set by the prctl(PR_SET_NAME) system call. + Name string + + // Arch is the architecture-specific context (registers, etc.) + Arch arch.Context + + // MemoryManager is the task's address space. + MemoryManager *mm.MemoryManager + + // fu implements futexes in the address space. + fu *futex.Manager + + // st is the task's syscall table. + st *SyscallTable `state:".(syscallTableInfo)"` +} + +// release releases all resources held by the TaskImage. release is called by +// the task when it execs into a new TaskImage or exits. +func (image *TaskImage) release() { + // Nil out pointers so that if the task is saved after release, it doesn't + // follow the pointers to possibly now-invalid objects. + if image.MemoryManager != nil { + image.MemoryManager.DecUsers(context.Background()) + image.MemoryManager = nil + } + image.fu = nil +} + +// Fork returns a duplicate of image. The copied TaskImage always has an +// independent arch.Context. If shareAddressSpace is true, the copied +// TaskImage shares an address space with the original; otherwise, the copied +// TaskImage has an independent address space that is initially a duplicate +// of the original's. +func (image *TaskImage) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskImage, error) { + newImage := &TaskImage{ + Name: image.Name, + Arch: image.Arch.Fork(), + st: image.st, + } + if shareAddressSpace { + newImage.MemoryManager = image.MemoryManager + if newImage.MemoryManager != nil { + if !newImage.MemoryManager.IncUsers() { + // Shouldn't be possible since image.MemoryManager should be a + // counted user. + panic(fmt.Sprintf("TaskImage.Fork called with userless TaskImage.MemoryManager")) + } + } + newImage.fu = image.fu + } else { + newMM, err := image.MemoryManager.Fork(ctx) + if err != nil { + return nil, err + } + newImage.MemoryManager = newMM + newImage.fu = k.futexes.Fork() + } + return newImage, nil +} + +// Arch returns t's arch.Context. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Arch() arch.Context { + return t.image.Arch +} + +// MemoryManager returns t's MemoryManager. MemoryManager does not take an +// additional reference on the returned MM. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) MemoryManager() *mm.MemoryManager { + return t.image.MemoryManager +} + +// SyscallTable returns t's syscall table. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) SyscallTable() *SyscallTable { + return t.image.st +} + +// Stack returns the userspace stack. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Stack() *arch.Stack { + return &arch.Stack{ + Arch: t.Arch(), + IO: t.MemoryManager(), + Bottom: usermem.Addr(t.Arch().Stack()), + } +} + +// LoadTaskImage loads a specified file into a new TaskImage. +// +// args.MemoryManager does not need to be set by the caller. +func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskImage, *syserr.Error) { + // If File is not nil, we should load that instead of resolving Filename. + if args.File != nil { + args.Filename = args.File.PathnameWithDeleted(ctx) + } + + // Prepare a new user address space to load into. + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) + defer m.DecUsers(ctx) + args.MemoryManager = m + + os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) + if err != nil { + return nil, err + } + + // Lookup our new syscall table. + st, ok := LookupSyscallTable(os, ac.Arch()) + if !ok { + // No syscall table found. This means that the ELF binary does not match + // the architecture. + return nil, errNoSyscalls + } + + if !m.IncUsers() { + panic("Failed to increment users count on new MM") + } + return &TaskImage{ + Name: name, + Arch: ac, + MemoryManager: m, + fu: k.futexes.Fork(), + st: st, + }, nil +} diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index d23cea802..c70e5e6ce 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -19,6 +19,7 @@ import ( "runtime/trace" "sort" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/usermem" ) @@ -215,7 +216,7 @@ func (t *Task) rebuildTraceContext(tid ThreadID) { // arbitrarily large (in general it won't be, especially for cases // where we're collecting a brief profile), so using the TID is a // reasonable compromise in this case. - t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid)) + t.traceContext, t.traceTask = trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", tid)) } // traceCloneEvent is called when a new task is spawned. @@ -237,11 +238,11 @@ func (t *Task) traceExitEvent() { } // traceExecEvent is called when a task calls exec. -func (t *Task) traceExecEvent(tc *TaskContext) { +func (t *Task) traceExecEvent(image *TaskImage) { if !trace.IsEnabled() { return } - file := tc.MemoryManager.Executable() + file := image.MemoryManager.Executable() if file == nil { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 8dc3fec90..3ccecf4b6 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -16,11 +16,13 @@ package kernel import ( "bytes" + "fmt" "runtime" "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/goid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -57,6 +59,8 @@ type taskRunState interface { // make it visible in stack dumps. A goroutine for a given task can be identified // searching for Task.run()'s argument value. func (t *Task) run(threadID uintptr) { + atomic.StoreInt64(&t.goid, goid.Get()) + // Construct t.blockingTimer here. We do this here because we can't // reconstruct t.blockingTimer during restore in Task.afterLoad(), because // kernel.timekeeper.SetClocks() hasn't been called yet. @@ -99,6 +103,9 @@ func (t *Task) run(threadID uintptr) { t.tg.pidns.owner.runningGoroutines.Done() t.p.Release() + // Deferring this store triggers a false positive in the race + // detector (https://github.com/golang/go/issues/42599). + atomic.StoreInt64(&t.goid, 0) // Keep argument alive because stack trace for dead variables may not be correct. runtime.KeepAlive(threadID) return @@ -317,7 +324,7 @@ func (app *runApp) execute(t *Task) taskRunState { // region. We should be able to easily identify // vsyscalls by having a <fault><syscall> pair. if at.Execute { - if sysno, ok := t.tc.st.LookupEmulate(addr); ok { + if sysno, ok := t.image.st.LookupEmulate(addr); ok { return t.doVsyscall(addr, sysno) } } @@ -375,6 +382,19 @@ func (app *runApp) execute(t *Task) taskRunState { } } +// assertTaskGoroutine panics if the caller is not running on t's task +// goroutine. +func (t *Task) assertTaskGoroutine() { + if got, want := goid.Get(), atomic.LoadInt64(&t.goid); got != want { + panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want)) + } +} + +// GoroutineID returns the ID of t's task goroutine. +func (t *Task) GoroutineID() int64 { + return atomic.LoadInt64(&t.goid) +} + // waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. func (t *Task) waitGoroutineStoppedOrExited() { t.goroutineStopped.Wait() diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 52c55d13d..9ba5f8d78 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -157,6 +157,18 @@ func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { t.goschedSeq.EndWrite() } +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) accountTaskGoroutineRunning() { + now := t.k.CPUClockNow() + if t.gosched.State != TaskGoroutineRunningSys { + panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys)) + } + t.goschedSeq.BeginWrite() + t.gosched.SysTicks += now - t.gosched.Timestamp + t.gosched.Timestamp = now + t.goschedSeq.EndWrite() +} + // TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. // Most clients should use t.CPUStats() instead. func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index ebdb83061..75af3af79 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -619,9 +619,6 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { return } }) - // We have to re-issue the interrupt consumed by t.interrupted() since - // it might have been for a different reason. - t.interruptSelf() } // Conversely, if the new mask unblocks any signals that were blocked by @@ -917,8 +914,8 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { Signo: int32(linux.SIGCHLD), Code: code, } - sigchld.SetPid(int32(t.tg.pidns.tids[target])) - sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + sigchld.SetPID(int32(t.tg.pidns.tids[target])) + sigchld.SetUID(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) sigchld.SetStatus(status) // TODO(b/72102453): Set utime, stime. t.sendSignalLocked(sigchld, true /* group */) @@ -931,10 +928,10 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { type runInterrupt struct{} func (*runInterrupt) execute(t *Task) taskRunState { - // Interrupts are de-duplicated (if t is interrupted twice before - // t.interrupted() is called, t.interrupted() will only return true once), - // so early exits from this function must re-enter the runInterrupt state - // to check for more interrupt-signaled conditions. + // Interrupts are de-duplicated (t.unsetInterrupted() will undo the effect + // of all previous calls to t.interrupted() regardless of how many such + // calls there have been), so early exits from this function must re-enter + // the runInterrupt state to check for more interrupt-signaled conditions. t.tg.signalHandlers.mu.Lock() @@ -1025,8 +1022,8 @@ func (*runInterrupt) execute(t *Task) taskRunState { Signo: int32(sig), Code: t.ptraceCode, } - t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) - t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } else { t.ptraceCode = int32(sig) t.ptraceSiginfo = nil @@ -1080,6 +1077,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { return t.deliverSignal(info, act) } + t.unsetInterrupted() t.tg.signalHandlers.mu.Unlock() return (*runApp)(nil) } @@ -1116,11 +1114,11 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { if parent == nil { // Tracer has detached and t was created by Kernel.CreateProcess(). // Pretend the parent is in an ancestor PID + user namespace. - info.SetPid(0) - info.SetUid(int32(auth.OverflowUID)) + info.SetPID(0) + info.SetUID(int32(auth.OverflowUID)) } else { - info.SetPid(int32(t.tg.pidns.tids[parent])) - info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + info.SetPID(int32(t.tg.pidns.tids[parent])) + info.SetUID(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } } t.tg.signalHandlers.mu.Lock() diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 8e28230cc..36e1384f1 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -46,10 +46,10 @@ type TaskConfig struct { // SignalMask is the new task's initial signal mask. SignalMask linux.SignalSet - // TaskContext is the TaskContext of the new task. Ownership of the - // TaskContext is transferred to TaskSet.NewTask, whether or not it + // TaskImage is the TaskImage of the new task. Ownership of the + // TaskImage is transferred to TaskSet.NewTask, whether or not it // succeeds. - TaskContext *TaskContext + TaskImage *TaskImage // FSContext is the FSContext of the new task. A reference must be held on // FSContext, which is transferred to TaskSet.NewTask whether or not it @@ -105,7 +105,7 @@ type TaskConfig struct { func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { - cfg.TaskContext.release() + cfg.TaskImage.release() cfg.FSContext.DecRef(ctx) cfg.FDTable.DecRef(ctx) cfg.IPCNamespace.DecRef(ctx) @@ -121,7 +121,7 @@ func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) // of cfg if it succeeds. func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { tg := cfg.ThreadGroup - tc := cfg.TaskContext + image := cfg.TaskImage t := &Task{ taskNode: taskNode{ tg: tg, @@ -132,7 +132,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { interruptChan: make(chan struct{}, 1), signalMask: cfg.SignalMask, signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, - tc: *tc, + image: *image, fsContext: cfg.FSContext, fdTable: cfg.FDTable, p: cfg.Kernel.Platform.NewContext(), |