diff options
Diffstat (limited to 'pkg/sentry/kernel')
42 files changed, 1046 insertions, 578 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c0de72eef..0ee60569c 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -79,7 +79,7 @@ go_template_instance( out = "fd_table_refs.go", package = "kernel", prefix = "FDTable", - template = "//pkg/refs_vfs2:refs_template", + template = "//pkg/refsvfs2:refs_template", types = { "T": "FDTable", }, @@ -90,7 +90,7 @@ go_template_instance( out = "fs_context_refs.go", package = "kernel", prefix = "FSContext", - template = "//pkg/refs_vfs2:refs_template", + template = "//pkg/refsvfs2:refs_template", types = { "T": "FSContext", }, @@ -101,7 +101,7 @@ go_template_instance( out = "ipc_namespace_refs.go", package = "kernel", prefix = "IPCNamespace", - template = "//pkg/refs_vfs2:refs_template", + template = "//pkg/refsvfs2:refs_template", types = { "T": "IPCNamespace", }, @@ -112,7 +112,7 @@ go_template_instance( out = "process_group_refs.go", package = "kernel", prefix = "ProcessGroup", - template = "//pkg/refs_vfs2:refs_template", + template = "//pkg/refsvfs2:refs_template", types = { "T": "ProcessGroup", }, @@ -123,7 +123,7 @@ go_template_instance( out = "session_refs.go", package = "kernel", prefix = "Session", - template = "//pkg/refs_vfs2:refs_template", + template = "//pkg/refsvfs2:refs_template", types = { "T": "Session", }, @@ -184,6 +184,7 @@ go_library( "task_exit.go", "task_futex.go", "task_identity.go", + "task_image.go", "task_list.go", "task_log.go", "task_net.go", @@ -224,12 +225,13 @@ go_library( "//pkg/cpuid", "//pkg/eventchannel", "//pkg/fspath", + "//pkg/goid", "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", "//pkg/metric", "//pkg/refs", - "//pkg/refs_vfs2", + "//pkg/refsvfs2", "//pkg/safemem", "//pkg/secio", "//pkg/sentry/arch", diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 1b9721534..0ddbe5ff6 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -19,7 +19,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs_vfs2" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" ) @@ -27,7 +27,7 @@ import ( // +stateify savable type abstractEndpoint struct { ep transport.BoundEndpoint - socket refs_vfs2.RefCounter + socket refsvfs2.RefCounter name string ns *AbstractSocketNamespace } @@ -57,7 +57,7 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace { // its backing socket. type boundEndpoint struct { transport.BoundEndpoint - socket refs_vfs2.RefCounter + socket refsvfs2.RefCounter } // Release implements transport.BoundEndpoint.Release. @@ -89,7 +89,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp // // When the last reference managed by socket is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refs_vfs2.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error { a.mu.Lock() defer a.mu.Unlock() @@ -109,7 +109,7 @@ func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep tran // Remove removes the specified socket at name from the abstract socket // namespace, if it has not yet been replaced. -func (a *AbstractSocketNamespace) Remove(name string, socket refs_vfs2.RefCounter) { +func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) { a.mu.Lock() defer a.mu.Unlock() diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go index 0ac78c0b8..ec36d1a49 100644 --- a/pkg/sentry/kernel/aio.go +++ b/pkg/sentry/kernel/aio.go @@ -15,10 +15,7 @@ package kernel import ( - "time" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" ) // AIOCallback is an function that does asynchronous I/O on behalf of a task. @@ -26,7 +23,7 @@ type AIOCallback func(context.Context) // QueueAIO queues an AIOCallback which will be run asynchronously. func (t *Task) QueueAIO(cb AIOCallback) { - ctx := taskAsyncContext{t: t} + ctx := t.AsyncContext() wg := &t.TaskSet().aioGoroutines wg.Add(1) go func() { @@ -34,48 +31,3 @@ func (t *Task) QueueAIO(cb AIOCallback) { wg.Done() }() } - -type taskAsyncContext struct { - context.NoopSleeper - t *Task -} - -// Debugf implements log.Logger.Debugf. -func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { - ctx.t.Debugf(format, v...) -} - -// Infof implements log.Logger.Infof. -func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { - ctx.t.Infof(format, v...) -} - -// Warningf implements log.Logger.Warningf. -func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { - ctx.t.Warningf(format, v...) -} - -// IsLogging implements log.Logger.IsLogging. -func (ctx taskAsyncContext) IsLogging(level log.Level) bool { - return ctx.t.IsLogging(level) -} - -// Deadline implements context.Context.Deadline. -func (ctx taskAsyncContext) Deadline() (time.Time, bool) { - return ctx.t.Deadline() -} - -// Done implements context.Context.Done. -func (ctx taskAsyncContext) Done() <-chan struct{} { - return ctx.t.Done() -} - -// Err implements context.Context.Err. -func (ctx taskAsyncContext) Err() error { - return ctx.t.Err() -} - -// Value implements context.Context.Value. -func (ctx taskAsyncContext) Value(key interface{}) interface{} { - return ctx.t.Value(key) -} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index bb94769c4..a8596410f 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -15,8 +15,6 @@ package kernel import ( - "time" - "gvisor.dev/gvisor/pkg/context" ) @@ -98,18 +96,3 @@ func TaskFromContext(ctx context.Context) *Task { } return nil } - -// Deadline implements context.Context.Deadline. -func (*Task) Deadline() (time.Time, bool) { - return time.Time{}, false -} - -// Done implements context.Context.Done. -func (*Task) Done() <-chan struct{} { - return nil -} - -// Err implements context.Context.Err. -func (*Task) Err() error { - return nil -} diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 15519f0df..61aeca044 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -273,7 +273,7 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { // // Callback is called when one of the files we're polling becomes ready. It // moves said file to the readyList if it's currently in the waiting list. -func (p *pollEntry) Callback(*waiter.Entry) { +func (p *pollEntry) Callback(*waiter.Entry, waiter.EventMask) { e := p.epoll e.listsMu.Lock() @@ -306,9 +306,8 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) { f.EventRegister(&entry.waiter, entry.mask) // Check if the file happens to already be in a ready state. - ready := f.Readiness(entry.mask) & entry.mask - if ready != 0 { - entry.Callback(&entry.waiter) + if ready := f.Readiness(entry.mask) & entry.mask; ready != 0 { + entry.Callback(&entry.waiter, ready) } } diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 2b3955598..f855f038b 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -8,11 +8,13 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", + "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 153d2cd9b..b66d61c6f 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -17,22 +17,45 @@ package fasync import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) -// New creates a new fs.FileAsync. -func New() fs.FileAsync { - return &FileAsync{} +// Table to convert waiter event masks into si_band siginfo codes. +// Taken from fs/fcntl.c:band_table. +var bandTable = map[waiter.EventMask]int64{ + // POLL_IN + waiter.EventIn: linux.EPOLLIN | linux.EPOLLRDNORM, + // POLL_OUT + waiter.EventOut: linux.EPOLLOUT | linux.EPOLLWRNORM | linux.EPOLLWRBAND, + // POLL_ERR + waiter.EventErr: linux.EPOLLERR, + // POLL_PRI + waiter.EventPri: linux.EPOLLPRI | linux.EPOLLRDBAND, + // POLL_HUP + waiter.EventHUp: linux.EPOLLHUP | linux.EPOLLERR, } -// NewVFS2 creates a new vfs.FileAsync. -func NewVFS2() vfs.FileAsync { - return &FileAsync{} +// New returns a function that creates a new fs.FileAsync with the given file +// descriptor. +func New(fd int) func() fs.FileAsync { + return func() fs.FileAsync { + return &FileAsync{fd: fd} + } +} + +// NewVFS2 returns a function that creates a new vfs.FileAsync with the given +// file descriptor. +func NewVFS2(fd int) func() vfs.FileAsync { + return func() vfs.FileAsync { + return &FileAsync{fd: fd} + } } // FileAsync sends signals when the registered file is ready for IO. @@ -42,6 +65,12 @@ type FileAsync struct { // e is immutable after first use (which is protected by mu below). e waiter.Entry + // fd is the file descriptor to notify about. + // It is immutable, set at allocation time. This matches Linux semantics in + // fs/fcntl.c:fasync_helper. + // The fd value is passed to the signal recipient in siginfo.si_fd. + fd int + // regMu protects registeration and unregistration actions on e. // // regMu must be held while registration decisions are being made @@ -56,6 +85,10 @@ type FileAsync struct { mu sync.Mutex `state:"nosave"` requester *auth.Credentials registered bool + // signal is the signal to deliver upon I/O being available. + // The default value ("zero signal") means the default SIGIO signal will be + // delivered. + signal linux.Signal // Only one of the following is allowed to be non-nil. recipientPG *kernel.ProcessGroup @@ -64,10 +97,10 @@ type FileAsync struct { } // Callback sends a signal. -func (a *FileAsync) Callback(e *waiter.Entry) { +func (a *FileAsync) Callback(e *waiter.Entry, mask waiter.EventMask) { a.mu.Lock() + defer a.mu.Unlock() if !a.registered { - a.mu.Unlock() return } t := a.recipientT @@ -80,19 +113,34 @@ func (a *FileAsync) Callback(e *waiter.Entry) { } if t == nil { // No recipient has been registered. - a.mu.Unlock() return } c := t.Credentials() // Logic from sigio_perm in fs/fcntl.c. - if a.requester.EffectiveKUID == 0 || + permCheck := (a.requester.EffectiveKUID == 0 || a.requester.EffectiveKUID == c.SavedKUID || a.requester.EffectiveKUID == c.RealKUID || a.requester.RealKUID == c.SavedKUID || - a.requester.RealKUID == c.RealKUID { - t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO)) + a.requester.RealKUID == c.RealKUID) + if !permCheck { + return } - a.mu.Unlock() + signalInfo := &arch.SignalInfo{ + Signo: int32(linux.SIGIO), + Code: arch.SignalInfoKernel, + } + if a.signal != 0 { + signalInfo.Signo = int32(a.signal) + signalInfo.SetFD(uint32(a.fd)) + var band int64 + for m, bandCode := range bandTable { + if m&mask != 0 { + band |= bandCode + } + } + signalInfo.SetBand(band) + } + t.SendSignal(signalInfo) } // Register sets the file which will be monitored for IO events. @@ -186,3 +234,25 @@ func (a *FileAsync) ClearOwner() { a.recipientTG = nil a.recipientPG = nil } + +// Signal returns which signal will be sent to the signal recipient. +// A value of zero means the signal to deliver wasn't customized, which means +// the default signal (SIGIO) will be delivered. +func (a *FileAsync) Signal() linux.Signal { + a.mu.Lock() + defer a.mu.Unlock() + return a.signal +} + +// SetSignal overrides which signal to send when I/O is available. +// The default behavior can be reset by specifying signal zero, which means +// to send SIGIO. +func (a *FileAsync) SetSignal(signal linux.Signal) error { + if signal != 0 && !signal.IsValid() { + return syserror.EINVAL + } + a.mu.Lock() + defer a.mu.Unlock() + a.signal = signal + return nil +} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 0ec7344cd..7aba31587 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -110,7 +110,7 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { ctx := context.Background() - f.init() // Initialize table. + f.initNoLeakCheck() // Initialize table. f.used = 0 for fd, d := range m { if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil { @@ -240,6 +240,10 @@ func (f *FDTable) String() string { case fileVFS2 != nil: vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() + vd := fileVFS2.VirtualDentry() + if vd.Dentry() == nil { + panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, fileVFS2.Impl(), fileVFS2)) + } name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) if err != nil { fmt.Fprintf(&buf, "<err: %v>\n", err) diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index da79e6627..f17f9c59c 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -31,14 +31,21 @@ type descriptorTable struct { slice unsafe.Pointer `state:".(map[int32]*descriptor)"` } -// init initializes the table. +// initNoLeakCheck initializes the table without enabling leak checking. // -// TODO(gvisor.dev/1486): Enable leak check for FDTable. -func (f *FDTable) init() { +// This is used when loading an FDTable after S/R, during which the ref count +// object itself will enable leak checking if necessary. +func (f *FDTable) initNoLeakCheck() { var slice []unsafe.Pointer // Empty slice. atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) } +// init initializes the table with leak checking. +func (f *FDTable) init() { + f.initNoLeakCheck() + f.InitRefs() +} + // get gets a file entry. // // The boolean indicates whether this was in range. @@ -114,18 +121,21 @@ func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 panic("VFS1 and VFS2 files set") } - slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + slicePtr := (*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) // Grow the table as required. - if last := int32(len(slice)); fd >= last { + if last := int32(len(*slicePtr)); fd >= last { end := fd + 1 if end < 2*last { end = 2 * last } - slice = append(slice, make([]unsafe.Pointer, end-last)...) - atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) + newSlice := append(*slicePtr, make([]unsafe.Pointer, end-last)...) + slicePtr = &newSlice + atomic.StorePointer(&f.slice, unsafe.Pointer(slicePtr)) } + slice := *slicePtr + var desc *descriptor if file != nil || fileVFS2 != nil { desc = &descriptor{ diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index d46d1e1c1..dfde4deee 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -63,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { cwd: cwd, umask: umask, } - f.EnableLeakCheck() + f.InitRefs() return &f } @@ -76,7 +76,7 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { cwdVFS2: cwd, umask: umask, } - f.EnableLeakCheck() + f.InitRefs() return &f } @@ -130,13 +130,15 @@ func (f *FSContext) Fork() *FSContext { f.root.IncRef() } - return &FSContext{ + ctx := &FSContext{ cwd: f.cwd, root: f.root, cwdVFS2: f.cwdVFS2, rootVFS2: f.rootVFS2, umask: f.umask, } + ctx.InitRefs() + return ctx } // WorkingDirectory returns the current working directory. @@ -147,19 +149,23 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() - f.cwd.IncRef() + if f.cwd != nil { + f.cwd.IncRef() + } return f.cwd } // WorkingDirectoryVFS2 returns the current working directory. // -// This will return nil if called after f is destroyed, otherwise it will return -// a Dirent with a reference taken. +// This will return an empty vfs.VirtualDentry if called after f is +// destroyed, otherwise it will return a Dirent with a reference taken. func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() - f.cwdVFS2.IncRef() + if f.cwdVFS2.Ok() { + f.cwdVFS2.IncRef() + } return f.cwdVFS2 } @@ -218,13 +224,15 @@ func (f *FSContext) RootDirectory() *fs.Dirent { // RootDirectoryVFS2 returns the current filesystem root. // -// This will return nil if called after f is destroyed, otherwise it will return -// a Dirent with a reference taken. +// This will return an empty vfs.VirtualDentry if called after f is +// destroyed, otherwise it will return a Dirent with a reference taken. func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() - f.rootVFS2.IncRef() + if f.rootVFS2.Ok() { + f.rootVFS2.IncRef() + } return f.rootVFS2 } diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 3f34ee0db..9545bb5ef 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -41,7 +41,7 @@ func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } - ns.EnableLeakCheck() + ns.InitRefs() return ns } @@ -55,7 +55,7 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } -// DecRef implements refs_vfs2.RefCounter.DecRef. +// DecRef implements refsvfs2.RefCounter.DecRef. func (i *IPCNamespace) DecRef(ctx context.Context) { i.IPCNamespaceRefs.DecRef(func() { i.shms.Release(ctx) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 0eb2bf7bd..b8627a54f 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -214,9 +214,11 @@ type Kernel struct { // netlinkPorts manages allocation of netlink socket port IDs. netlinkPorts *port.Manager - // saveErr is the error causing the sandbox to exit during save, if - // any. It is protected by extMu. - saveErr error `state:"nosave"` + // saveStatus is nil if the sandbox has not been saved, errSaved or + // errAutoSaved if it has been saved successfully, or the error causing the + // sandbox to exit during save. + // It is protected by extMu. + saveStatus error `state:"nosave"` // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` @@ -430,9 +432,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { // SaveTo saves the state of k to w. // // Preconditions: The kernel must be paused throughout the call to SaveTo. -func (k *Kernel) SaveTo(w wire.Writer) error { +func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error { saveStart := time.Now() - ctx := k.SupervisorContext() // Do not allow other Kernel methods to affect it while it's being saved. k.extMu.Lock() @@ -446,38 +447,55 @@ func (k *Kernel) SaveTo(w wire.Writer) error { k.mf.StartEvictions() k.mf.WaitForEvictions() - // Flush write operations on open files so data reaches backing storage. - // This must come after MemoryFile eviction since eviction may cause file - // writes. - if err := k.tasks.flushWritesToFiles(ctx); err != nil { - return err - } + if VFS2Enabled { + // Discard unsavable mappings, such as those for host file descriptors. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Prepare filesystems for saving. This must be done after + // invalidateUnsavableMappings(), since dropping memory mappings may + // affect filesystem state (e.g. page cache reference counts). + if err := k.vfs.PrepareSave(ctx); err != nil { + return err + } + } else { + // Flush cached file writes to backing storage. This must come after + // MemoryFile eviction since eviction may cause file writes. + if err := k.flushWritesToFiles(ctx); err != nil { + return err + } - // Remove all epoll waiter objects from underlying wait queues. - // NOTE: for programs to resume execution in future snapshot scenarios, - // we will need to re-establish these waiter objects after saving. - k.tasks.unregisterEpollWaiters(ctx) + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters(ctx) - // Clear the dirent cache before saving because Dirents must be Loaded in a - // particular order (parents before children), and Loading dirents from a cache - // breaks that order. - if err := k.flushMountSourceRefs(ctx); err != nil { - return err - } + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + if err := k.flushMountSourceRefs(ctx); err != nil { + return err + } - // Ensure that all inode and mount release operations have completed. - fs.AsyncBarrier() + // Ensure that all inode and mount release operations have completed. + fs.AsyncBarrier() - // Once all fs work has completed (flushed references have all been released), - // reset mount mappings. This allows individual mounts to save how inodes map - // to filesystem resources. Without this, fs.Inodes cannot be restored. - fs.SaveInodeMappings() + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() - // Discard unsavable mappings, such as those for host file descriptors. - // This must be done after waiting for "asynchronous fs work", which - // includes async I/O that may touch application memory. - if err := k.invalidateUnsavableMappings(ctx); err != nil { - return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. + // + // TODO(gvisor.dev/issue/1624): This rationale is believed to be + // obsolete since AIO callbacks are now waited-for by Kernel.Pause(), + // but this order is conservatively retained for VFS1. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } } // Save the CPUID FeatureSet before the rest of the kernel so we can @@ -486,14 +504,14 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // // N.B. This will also be saved along with the full kernel save below. cpuidStart := time.Now() - if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil { + if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) // Save the kernel state. kernelStart := time.Now() - stats, err := state.Save(k.SupervisorContext(), w, k) + stats, err := state.Save(ctx, w, k) if err != nil { return err } @@ -502,7 +520,7 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // Save the memory file's state. memoryStart := time.Now() - if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { + if err := k.mf.SaveTo(ctx, w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) @@ -514,11 +532,9 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. +// +// Preconditions: !VFS2Enabled. func (k *Kernel) flushMountSourceRefs(ctx context.Context) error { - if VFS2Enabled { - return nil // Not relevant. - } - // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() @@ -561,13 +577,9 @@ func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.Fi return err } -func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - if VFS2Enabled { - return nil - } - - return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { +// Preconditions: !VFS2Enabled. +func (k *Kernel) flushWritesToFiles(ctx context.Context) error { + return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } @@ -589,37 +601,8 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { }) } -// Preconditions: The kernel must be paused. -func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { - invalidated := make(map[*mm.MemoryManager]struct{}) - k.tasks.mu.RLock() - defer k.tasks.mu.RUnlock() - for t := range k.tasks.Root.tids { - // We can skip locking Task.mu here since the kernel is paused. - if mm := t.tc.MemoryManager; mm != nil { - if _, ok := invalidated[mm]; !ok { - if err := mm.InvalidateUnsavable(ctx); err != nil { - return err - } - invalidated[mm] = struct{}{} - } - } - // I really wish we just had a sync.Map of all MMs... - if r, ok := t.runState.(*runSyscallAfterExecStop); ok { - if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { - return err - } - } - } - return nil -} - +// Preconditions: !VFS2Enabled. func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - if VFS2Enabled { - return - } - ts.mu.RLock() defer ts.mu.RUnlock() @@ -644,8 +627,33 @@ func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) { } } +// Preconditions: The kernel must be paused. +func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { + invalidated := make(map[*mm.MemoryManager]struct{}) + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t := range k.tasks.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if mm := t.image.MemoryManager; mm != nil { + if _, ok := invalidated[mm]; !ok { + if err := mm.InvalidateUnsavable(ctx); err != nil { + return err + } + invalidated[mm] = struct{}{} + } + } + // I really wish we just had a sync.Map of all MMs... + if r, ok := t.runState.(*runSyscallAfterExecStop); ok { + if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + // LoadFrom returns a new Kernel loaded from args. -func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error { +func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { loadStart := time.Now() initAppCores := k.applicationCores @@ -656,7 +664,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() var features cpuid.FeatureSet - if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil { + if _, err := state.Load(ctx, r, &features); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) @@ -671,7 +679,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock // Load the kernel state. kernelStart := time.Now() - stats, err := state.Load(k.SupervisorContext(), r, k) + stats, err := state.Load(ctx, r, k) if err != nil { return err } @@ -684,7 +692,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock // Load the memory file's state. memoryStart := time.Now() - if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { + if err := k.mf.LoadFrom(ctx, r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) @@ -696,11 +704,17 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock net.Resume() } - // Ensure that all pending asynchronous work is complete: - // - namedpipe opening - // - inode file opening - if err := fs.AsyncErrorBarrier(); err != nil { - return err + if VFS2Enabled { + if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil { + return err + } + } else { + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + if err := fs.AsyncErrorBarrier(); err != nil { + return err + } } tcpip.AsyncLoading.Wait() @@ -1005,7 +1019,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, Features: k.featureSet, } - tc, se := k.LoadTaskImage(ctx, loadArgs) + image, se := k.LoadTaskImage(ctx, loadArgs) if se != nil { return nil, 0, errors.New(se.String()) } @@ -1018,7 +1032,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, config := &TaskConfig{ Kernel: k, ThreadGroup: tg, - TaskContext: tc, + TaskImage: image, FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, @@ -1034,7 +1048,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, err } - t.traceExecEvent(tc) // Simulate exec for tracing. + t.traceExecEvent(image) // Simulate exec for tracing. // Success. cu.Release() @@ -1347,6 +1361,13 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { // not have meaningful trace data. Rebuilding here ensures that we can do so // after tracing has been enabled. func (k *Kernel) RebuildTraceContexts() { + // We need to pause all task goroutines because Task.rebuildTraceContext() + // replaces Task.traceContext and Task.traceTask, which are + // task-goroutine-exclusive (i.e. the task goroutine assumes that it can + // access them without synchronization) for performance. + k.Pause() + defer k.Unpause() + k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() @@ -1462,12 +1483,42 @@ func (k *Kernel) NetlinkPorts() *port.Manager { return k.netlinkPorts } -// SaveError returns the sandbox error that caused the kernel to exit during -// save. -func (k *Kernel) SaveError() error { +var ( + errSaved = errors.New("sandbox has been successfully saved") + errAutoSaved = errors.New("sandbox has been successfully auto-saved") +) + +// SaveStatus returns the sandbox save status. If it was saved successfully, +// autosaved indicates whether save was triggered by autosave. If it was not +// saved successfully, err indicates the sandbox error that caused the kernel to +// exit during save. +func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) { k.extMu.Lock() defer k.extMu.Unlock() - return k.saveErr + switch k.saveStatus { + case nil: + return false, false, nil + case errSaved: + return true, false, nil + case errAutoSaved: + return true, true, nil + default: + return false, false, k.saveStatus + } +} + +// SetSaveSuccess sets the flag indicating that save completed successfully, if +// no status was already set. +func (k *Kernel) SetSaveSuccess(autosave bool) { + k.extMu.Lock() + defer k.extMu.Unlock() + if k.saveStatus == nil { + if autosave { + k.saveStatus = errAutoSaved + } else { + k.saveStatus = errSaved + } + } } // SetSaveError sets the sandbox error that caused the kernel to exit during @@ -1475,8 +1526,8 @@ func (k *Kernel) SaveError() error { func (k *Kernel) SetSaveError(err error) { k.extMu.Lock() defer k.extMu.Unlock() - if k.saveErr == nil { - k.saveErr = err + if k.saveStatus == nil { + k.saveStatus = err } } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index ce0db5583..d6fb0fdb8 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) type sleeper struct { @@ -66,7 +65,8 @@ func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flag d := fs.NewDirent(ctx, inode, "pipe") file, err := n.GetFile(ctx, d, flags) if err != nil { - t.Fatalf("open with flags %+v failed: %v", flags, err) + t.Errorf("open with flags %+v failed: %v", flags, err) + return nil, err } if doneChan != nil { doneChan <- struct{}{} @@ -85,11 +85,11 @@ func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs. } func newNamedPipe(t *testing.T) *Pipe { - return NewPipe(true, DefaultPipeSize, usermem.PageSize) + return NewPipe(true, DefaultPipeSize) } func newAnonPipe(t *testing.T) *Pipe { - return NewPipe(false, DefaultPipeSize, usermem.PageSize) + return NewPipe(false, DefaultPipeSize) } // assertRecvBlocks ensures that a recv attempt on c blocks for at least diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 67beb0ad6..b989e14c7 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -26,18 +26,27 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( // MinimumPipeSize is a hard limit of the minimum size of a pipe. - MinimumPipeSize = 64 << 10 + // It corresponds to fs/pipe.c:pipe_min_size. + MinimumPipeSize = usermem.PageSize + + // MaximumPipeSize is a hard limit on the maximum size of a pipe. + // It corresponds to fs/pipe.c:pipe_max_size. + MaximumPipeSize = 1048576 // DefaultPipeSize is the system-wide default size of a pipe in bytes. - DefaultPipeSize = MinimumPipeSize + // It corresponds to pipe_fs_i.h:PIPE_DEF_BUFFERS. + DefaultPipeSize = 16 * usermem.PageSize - // MaximumPipeSize is a hard limit on the maximum size of a pipe. - MaximumPipeSize = 8 << 20 + // atomicIOBytes is the maximum number of bytes that the pipe will + // guarantee atomic reads or writes atomically. + // It corresponds to limits.h:PIPE_BUF. + atomicIOBytes = 4096 ) // Pipe is an encapsulation of a platform-independent pipe. @@ -53,12 +62,6 @@ type Pipe struct { // This value is immutable. isNamed bool - // atomicIOBytes is the maximum number of bytes that the pipe will - // guarantee atomic reads or writes atomically. - // - // This value is immutable. - atomicIOBytes int64 - // The number of active readers for this pipe. // // Access atomically. @@ -94,47 +97,34 @@ type Pipe struct { // NewPipe initializes and returns a pipe. // -// N.B. The size and atomicIOBytes will be bounded. -func NewPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe { +// N.B. The size will be bounded. +func NewPipe(isNamed bool, sizeBytes int64) *Pipe { if sizeBytes < MinimumPipeSize { sizeBytes = MinimumPipeSize } if sizeBytes > MaximumPipeSize { sizeBytes = MaximumPipeSize } - if atomicIOBytes <= 0 { - atomicIOBytes = 1 - } - if atomicIOBytes > sizeBytes { - atomicIOBytes = sizeBytes - } var p Pipe - initPipe(&p, isNamed, sizeBytes, atomicIOBytes) + initPipe(&p, isNamed, sizeBytes) return &p } -func initPipe(pipe *Pipe, isNamed bool, sizeBytes, atomicIOBytes int64) { +func initPipe(pipe *Pipe, isNamed bool, sizeBytes int64) { if sizeBytes < MinimumPipeSize { sizeBytes = MinimumPipeSize } if sizeBytes > MaximumPipeSize { sizeBytes = MaximumPipeSize } - if atomicIOBytes <= 0 { - atomicIOBytes = 1 - } - if atomicIOBytes > sizeBytes { - atomicIOBytes = sizeBytes - } pipe.isNamed = isNamed pipe.max = sizeBytes - pipe.atomicIOBytes = atomicIOBytes } // NewConnectedPipe initializes a pipe and returns a pair of objects // representing the read and write ends of the pipe. -func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) { - p := NewPipe(false /* isNamed */, sizeBytes, atomicIOBytes) +func NewConnectedPipe(ctx context.Context, sizeBytes int64) (*fs.File, *fs.File) { + p := NewPipe(false /* isNamed */, sizeBytes) // Build an fs.Dirent for the pipe which will be shared by both // returned files. @@ -264,7 +254,7 @@ func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { wanted := ops.left() avail := p.max - p.view.Size() if wanted > avail { - if wanted <= p.atomicIOBytes { + if wanted <= atomicIOBytes { return 0, syserror.ErrWouldBlock } ops.limit(avail) diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index fe97e9800..3dd739080 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -26,7 +26,7 @@ import ( func TestPipeRW(t *testing.T) { ctx := contexttest.Context(t) - r, w := NewConnectedPipe(ctx, 65536, 4096) + r, w := NewConnectedPipe(ctx, 65536) defer r.DecRef(ctx) defer w.DecRef(ctx) @@ -46,7 +46,7 @@ func TestPipeRW(t *testing.T) { func TestPipeReadBlock(t *testing.T) { ctx := contexttest.Context(t) - r, w := NewConnectedPipe(ctx, 65536, 4096) + r, w := NewConnectedPipe(ctx, 65536) defer r.DecRef(ctx) defer w.DecRef(ctx) @@ -61,7 +61,7 @@ func TestPipeWriteBlock(t *testing.T) { const capacity = MinimumPipeSize ctx := contexttest.Context(t) - r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes) + r, w := NewConnectedPipe(ctx, capacity) defer r.DecRef(ctx) defer w.DecRef(ctx) @@ -76,7 +76,7 @@ func TestPipeWriteUntilEnd(t *testing.T) { const atomicIOBytes = 2 ctx := contexttest.Context(t) - r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) + r, w := NewConnectedPipe(ctx, atomicIOBytes) defer r.DecRef(ctx) defer w.DecRef(ctx) @@ -116,7 +116,8 @@ func TestPipeWriteUntilEnd(t *testing.T) { } } if err != nil { - t.Fatalf("Readv: got unexpected error %v", err) + t.Errorf("Readv: got unexpected error %v", err) + return } } }() diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 1a152142b..7b23cbe86 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -33,6 +33,8 @@ import ( // VFSPipe represents the actual pipe, analagous to an inode. VFSPipes should // not be copied. +// +// +stateify savable type VFSPipe struct { // mu protects the fields below. mu sync.Mutex `state:"nosave"` @@ -52,9 +54,9 @@ type VFSPipe struct { } // NewVFSPipe returns an initialized VFSPipe. -func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { +func NewVFSPipe(isNamed bool, sizeBytes int64) *VFSPipe { var vp VFSPipe - initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes) + initPipe(&vp.pipe, isNamed, sizeBytes) return &vp } @@ -164,6 +166,8 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, l // VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements // non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to // other FileDescriptions for splice(2) and tee(2). +// +// +stateify savable type VFSPipeFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 1145faf13..cef58a590 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -259,8 +259,8 @@ func (t *Task) ptraceTrapLocked(code int32) { Signo: int32(linux.SIGTRAP), Code: code, } - t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) - t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) if t.beginPtraceStopLocked() { tracer := t.Tracer() tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) @@ -1000,7 +1000,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { // at the address specified by the data parameter, and the return value // is the error flag." - ptrace(2) word := t.Arch().Native(0) - if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil { + if _, err := word.CopyIn(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr); err != nil { return err } _, err := word.CopyOut(t, data) @@ -1008,7 +1008,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: word := t.Arch().Native(uintptr(data)) - _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr) + _, err := word.CopyOut(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr) return err case linux.PTRACE_GETREGSET: diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 387edfa91..60917e7d3 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -106,7 +106,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { data := linux.SeccompData{ Nr: sysno, - Arch: t.tc.st.AuditNumber, + Arch: t.image.st.AuditNumber, InstructionPointer: uint64(ip), } // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index c00fa1138..3dd3953b3 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -29,17 +29,17 @@ import ( ) const ( - valueMax = 32767 // SEMVMX + // Maximum semaphore value. + valueMax = linux.SEMVMX - // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL). - semaphoresMax = 32000 + // Maximum number of semaphore sets. + setsMax = linux.SEMMNI - // setMax is "system-wide limit on the number of semaphore sets" (SEMMNI). - setsMax = 32000 + // Maximum number of semaphroes in a semaphore set. + semsMax = linux.SEMMSL - // semaphoresTotalMax is "system-wide limit on the number of semaphores" - // (SEMMNS = SEMMNI*SEMMSL). - semaphoresTotalMax = 1024000000 + // Maximum number of semaphores in all semaphroe sets. + semsTotalMax = linux.SEMMNS ) // Registry maintains a set of semaphores that can be found by key or ID. @@ -52,6 +52,9 @@ type Registry struct { mu sync.Mutex `state:"nosave"` semaphores map[int32]*Set lastIDUsed int32 + // indexes maintains a mapping between a set's index in virtual array and + // its identifier. + indexes map[int32]int32 } // Set represents a set of semaphores that can be operated atomically. @@ -103,6 +106,7 @@ type waiter struct { waiterEntry // value represents how much resource the waiter needs to wake up. + // The value is either 0 or negative. value int16 ch chan struct{} } @@ -112,6 +116,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ userNS: userNS, semaphores: make(map[int32]*Set), + indexes: make(map[int32]int32), } } @@ -121,7 +126,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { // be found. If exclusive is true, it fails if a set with the same key already // exists. func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { - if nsems < 0 || nsems > semaphoresMax { + if nsems < 0 || nsems > semsMax { return nil, syserror.EINVAL } @@ -162,10 +167,13 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu } // Apply system limits. + // + // Map semaphores and map indexes in a registry are of the same size, + // check map semaphores only here for the system limit. if len(r.semaphores) >= setsMax { return nil, syserror.EINVAL } - if r.totalSems() > int(semaphoresTotalMax-nsems) { + if r.totalSems() > int(semsTotalMax-nsems) { return nil, syserror.EINVAL } @@ -175,6 +183,39 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu return r.newSet(ctx, key, owner, owner, perms, nsems) } +// IPCInfo returns information about system-wide semaphore limits and parameters. +func (r *Registry) IPCInfo() *linux.SemInfo { + return &linux.SemInfo{ + SemMap: linux.SEMMAP, + SemMni: linux.SEMMNI, + SemMns: linux.SEMMNS, + SemMnu: linux.SEMMNU, + SemMsl: linux.SEMMSL, + SemOpm: linux.SEMOPM, + SemUme: linux.SEMUME, + SemUsz: 0, // SemUsz not supported. + SemVmx: linux.SEMVMX, + SemAem: linux.SEMAEM, + } +} + +// HighestIndex returns the index of the highest used entry in +// the kernel's array. +func (r *Registry) HighestIndex() int32 { + r.mu.Lock() + defer r.mu.Unlock() + + // By default, highest used index is 0 even though + // there is no semaphroe set. + var highestIndex int32 + for index := range r.indexes { + if index > highestIndex { + highestIndex = index + } + } + return highestIndex +} + // RemoveID removes set with give 'id' from the registry and marks the set as // dead. All waiters will be awakened and fail. func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { @@ -185,6 +226,11 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { if set == nil { return syserror.EINVAL } + index, found := r.findIndexByID(id) + if !found { + // Inconsistent state. + panic(fmt.Sprintf("unable to find an index for ID: %d", id)) + } set.mu.Lock() defer set.mu.Unlock() @@ -196,6 +242,7 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { } delete(r.semaphores, set.ID) + delete(r.indexes, index) set.destroy() return nil } @@ -219,6 +266,11 @@ func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.File continue } if r.semaphores[id] == nil { + index, found := r.findFirstAvailableIndex() + if !found { + panic("unable to find an available index") + } + r.indexes[index] = id r.lastIDUsed = id r.semaphores[id] = set set.ID = id @@ -246,6 +298,24 @@ func (r *Registry) findByKey(key int32) *Set { return nil } +func (r *Registry) findIndexByID(id int32) (int32, bool) { + for k, v := range r.indexes { + if v == id { + return k, true + } + } + return 0, false +} + +func (r *Registry) findFirstAvailableIndex() (int32, bool) { + for index := int32(0); index < setsMax; index++ { + if _, present := r.indexes[index]; !present { + return index, true + } + } + return 0, false +} + func (r *Registry) totalSems() int { totalSems := 0 for _, v := range r.semaphores { @@ -283,6 +353,33 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File return nil } +// GetStat extracts semid_ds information from the set. +func (s *Set) GetStat(creds *auth.Credentials) (*linux.SemidDS, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return nil, syserror.EACCES + } + + ds := &linux.SemidDS{ + SemPerm: linux.IPCPerm{ + Key: uint32(s.key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), + Mode: uint16(s.perms.LinuxMode()), + Seq: 0, // IPC sequence not supported. + }, + SemOTime: s.opTime.TimeT(), + SemCTime: s.changeTime.TimeT(), + SemNSems: uint64(s.Size()), + } + return ds, nil +} + // SetVal overrides a semaphore value, waking up waiters as needed. func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { if val < 0 || val > valueMax { @@ -320,7 +417,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti } for _, val := range vals { - if val < 0 || val > valueMax { + if val > valueMax { return syserror.ERANGE } } @@ -396,6 +493,42 @@ func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) { return sem.pid, nil } +func (s *Set) countWaiters(num int32, creds *auth.Credentials, pred func(w *waiter) bool) (uint16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // The calling process must have read permission on the semaphore set. + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + var cnt uint16 + for w := sem.waiters.Front(); w != nil; w = w.Next() { + if pred(w) { + cnt++ + } + } + return cnt, nil +} + +// CountZeroWaiters returns number of waiters waiting for the sem's value to increase. +func (s *Set) CountZeroWaiters(num int32, creds *auth.Credentials) (uint16, error) { + return s.countWaiters(num, creds, func(w *waiter) bool { + return w.value == 0 + }) +} + +// CountNegativeWaiters returns number of waiters waiting for the sem to go to zero. +func (s *Set) CountNegativeWaiters(num int32, creds *auth.Credentials) (uint16, error) { + return s.countWaiters(num, creds, func(w *waiter) bool { + return w.value < 0 + }) +} + // ExecuteOps attempts to execute a list of operations to the set. It only // succeeds when all operations can be applied. No changes are made if it fails. // @@ -548,11 +681,18 @@ func (s *Set) destroy() { } } +func abs(val int16) int16 { + if val < 0 { + return -val + } + return val +} + // wakeWaiters goes over all waiters and checks which of them can be notified. func (s *sem) wakeWaiters() { // Note that this will release all waiters waiting for 0 too. for w := s.waiters.Front(); w != nil; { - if s.value < w.value { + if s.value < abs(w.value) { // Still blocked, skip it. w = w.Next() continue diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index df5c8421b..0cd9e2533 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -295,7 +295,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } - s.EnableLeakCheck() + s.InitRefs() // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -309,7 +309,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } - pg.refs.EnableLeakCheck() + pg.refs.InitRefs() // Tie them and return the result. s.processGroups.PushBack(pg) @@ -395,7 +395,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error { originator: tg, session: tg.processGroup.session, } - pg.refs.EnableLeakCheck() + pg.refs.InitRefs() if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ @@ -477,20 +477,20 @@ func (tg *ThreadGroup) Session() *Session { // // If this group isn't visible in this namespace, zero will be returned. It is // the callers responsibility to check that before using this function. -func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID { - pidns.owner.mu.RLock() - defer pidns.owner.mu.RUnlock() - return pidns.sids[s] +func (ns *PIDNamespace) IDOfSession(s *Session) SessionID { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.sids[s] } // SessionWithID returns the Session with the given ID in the PID namespace ns, // or nil if that given ID is not defined in this namespace. // // A reference is not taken on the session. -func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session { - pidns.owner.mu.RLock() - defer pidns.owner.mu.RUnlock() - return pidns.sessions[id] +func (ns *PIDNamespace) SessionWithID(id SessionID) *Session { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.sessions[id] } // ProcessGroup returns the ThreadGroup's ProcessGroup. @@ -505,18 +505,18 @@ func (tg *ThreadGroup) ProcessGroup() *ProcessGroup { // IDOfProcessGroup returns the process group assigned to pg in PID namespace ns. // // The same constraints apply as IDOfSession. -func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID { - pidns.owner.mu.RLock() - defer pidns.owner.mu.RUnlock() - return pidns.pgids[pg] +func (ns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.pgids[pg] } // ProcessGroupWithID returns the ProcessGroup with the given ID in the PID // namespace ns, or nil if that given ID is not defined in this namespace. // // A reference is not taken on the process group. -func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup { - pidns.owner.mu.RLock() - defer pidns.owner.mu.RUnlock() - return pidns.processGroups[id] +func (ns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.processGroups[id] } diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index f8a382fd8..073e14507 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -6,9 +6,12 @@ package(licenses = ["notice"]) go_template_instance( name = "shm_refs", out = "shm_refs.go", + consts = { + "enableLogging": "true", + }, package = "shm", prefix = "Shm", - template = "//pkg/refs_vfs2:refs_template", + template = "//pkg/refsvfs2:refs_template", types = { "T": "Shm", }, @@ -27,7 +30,7 @@ go_library( "//pkg/context", "//pkg/log", "//pkg/refs", - "//pkg/refs_vfs2", + "//pkg/refsvfs2", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index ebbebf46b..92d60ba78 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -251,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } - shm.EnableLeakCheck() + shm.InitRefs() // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index e8cce37d0..2488ae7d5 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -73,7 +73,7 @@ func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo Signo: int32(sig), Code: arch.SignalInfoUser, } - info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) - info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + info.SetPID(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) + info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) return info } diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 78f718cfe..884966120 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -106,8 +106,8 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, - PID: uint32(info.Pid()), - UID: uint32(info.Uid()), + PID: uint32(info.PID()), + UID: uint32(info.UID()), Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go index 90f890495..0b17a562e 100644 --- a/pkg/sentry/kernel/syscalls_state.go +++ b/pkg/sentry/kernel/syscalls_state.go @@ -30,18 +30,18 @@ type syscallTableInfo struct { } // saveSt saves the SyscallTable. -func (tc *TaskContext) saveSt() syscallTableInfo { +func (image *TaskImage) saveSt() syscallTableInfo { return syscallTableInfo{ - OS: tc.st.OS, - Arch: tc.st.Arch, + OS: image.st.OS, + Arch: image.st.Arch, } } // loadSt loads the SyscallTable. -func (tc *TaskContext) loadSt(sti syscallTableInfo) { +func (image *TaskImage) loadSt(sti syscallTableInfo) { st, ok := LookupSyscallTable(sti.OS, sti.Arch) if !ok { panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch)) } - tc.st = st // Save the table reference. + image.st = st // Save the table reference. } diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index a83ce219c..3fee7aa68 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -75,6 +75,12 @@ func (s *syslog) Log() []byte { "Checking naughty and nice process list...", // Check it up to twice. "Granting licence to kill(2)...", // British spelling for British movie. "Letting the watchdogs out...", + "Conjuring /dev/null black hole...", + "Adversarially training Redcode AI...", + "Singleplexing /dev/ptmx...", + "Recruiting cron-ies...", + "Verifying that no non-zero bytes made their way into /dev/zero...", + "Accelerating teletypewriter to 9600 baud...", } selectMessage := func() string { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 037971393..c0ab53c94 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -29,11 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -63,6 +58,12 @@ import ( type Task struct { taskNode + // goid is the task goroutine's ID. goid is owned by the task goroutine, + // but since it's used to detect cases where non-task goroutines + // incorrectly access state owned by, or exclusive to, the task goroutine, + // goid is always accessed using atomic memory operations. + goid int64 `state:"nosave"` + // runState is what the task goroutine is executing if it is not stopped. // If runState is nil, the task goroutine should exit or has exited. // runState is exclusive to the task goroutine. @@ -83,7 +84,7 @@ type Task struct { // taskWork is exclusive to the task goroutine. taskWork []TaskWorker - // haveSyscallReturn is true if tc.Arch().Return() represents a value + // haveSyscallReturn is true if image.Arch().Return() represents a value // returned by a syscall (or set by ptrace after a syscall). // // haveSyscallReturn is exclusive to the task goroutine. @@ -257,10 +258,10 @@ type Task struct { // mu protects some of the following fields. mu sync.Mutex `state:"nosave"` - // tc holds task data provided by the ELF loader. + // image holds task data provided by the ELF loader. // - // tc is protected by mu, and is owned by the task goroutine. - tc TaskContext + // image is protected by mu, and is owned by the task goroutine. + image TaskImage // fsContext is the task's filesystem context. // @@ -274,7 +275,7 @@ type Task struct { // If vforkParent is not nil, it is the task that created this task with // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when - // this TaskContext is released. + // this TaskImage is released. // // vforkParent is protected by the TaskSet mutex. vforkParent *Task @@ -641,64 +642,6 @@ func (t *Task) Kernel() *Kernel { return t.k } -// Value implements context.Context.Value. -// -// Preconditions: The caller must be running on the task goroutine (as implied -// by the requirements of context.Context). -func (t *Task) Value(key interface{}) interface{} { - switch key { - case CtxCanTrace: - return t.CanTrace - case CtxKernel: - return t.k - case CtxPIDNamespace: - return t.tg.pidns - case CtxUTSNamespace: - return t.utsns - case CtxIPCNamespace: - ipcns := t.IPCNamespace() - ipcns.IncRef() - return ipcns - case CtxTask: - return t - case auth.CtxCredentials: - return t.Credentials() - case context.CtxThreadGroupID: - return int32(t.ThreadGroup().ID()) - case fs.CtxRoot: - return t.fsContext.RootDirectory() - case vfs.CtxRoot: - return t.fsContext.RootDirectoryVFS2() - case vfs.CtxMountNamespace: - t.mountNamespaceVFS2.IncRef() - return t.mountNamespaceVFS2 - case fs.CtxDirentCacheLimiter: - return t.k.DirentCacheLimiter - case inet.CtxStack: - return t.NetworkContext() - case ktime.CtxRealtimeClock: - return t.k.RealtimeClock() - case limits.CtxLimits: - return t.tg.limits - case pgalloc.CtxMemoryFile: - return t.k.mf - case pgalloc.CtxMemoryFileProvider: - return t.k - case platform.CtxPlatform: - return t.k - case uniqueid.CtxGlobalUniqueID: - return t.k.UniqueID() - case uniqueid.CtxGlobalUniqueIDProvider: - return t.k - case uniqueid.CtxInotifyCookie: - return t.k.GenerateInotifyCookie() - case unimpl.CtxEvents: - return t.k - default: - return nil - } -} - // SetClearTID sets t's cleartid. // // Preconditions: The caller must be running on the task goroutine. @@ -751,12 +694,12 @@ func (t *Task) IsChrooted() bool { return root != realRoot } -// TaskContext returns t's TaskContext. +// TaskImage returns t's TaskImage. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. -func (t *Task) TaskContext() *TaskContext { - return &t.tc +func (t *Task) TaskImage() *TaskImage { + return &t.image } // FSContext returns t's FSContext. FSContext does not take an additional diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index 5f3e60fe8..e574997f7 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -136,14 +136,14 @@ func (tg *ThreadGroup) IOUsage() *usage.IO { func (t *Task) Name() string { t.mu.Lock() defer t.mu.Unlock() - return t.tc.Name + return t.image.Name } // SetName changes t's name. func (t *Task) SetName(name string) { t.mu.Lock() defer t.mu.Unlock() - t.tc.Name = name + t.image.Name = name t.Debugf("Set thread name to %q", name) } diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 4a4a69ee2..9419f2e95 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -20,6 +20,7 @@ import ( "time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,6 +33,8 @@ import ( // // - An error which is nil if an event is received from C, ETIMEDOUT if the timeout // expired, and syserror.ErrInterrupted if t is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { if !haveTimeout { return timeout, t.block(C, nil) @@ -112,7 +115,14 @@ func (t *Task) Block(C <-chan struct{}) error { // block blocks a task on one of many events. // N.B. defer is too expensive to be used here. +// +// Preconditions: The caller must be running on the task goroutine. func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { + // This function is very hot; skip this check outside of +race builds. + if sync.RaceEnabled { + t.assertTaskGoroutine() + } + // Fast path if the request is already done. select { case <-C: @@ -156,33 +166,39 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { } } -// SleepStart implements amutex.Sleeper.SleepStart. +// SleepStart implements context.ChannelSleeper.SleepStart. func (t *Task) SleepStart() <-chan struct{} { + t.assertTaskGoroutine() t.Deactivate() t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible) return t.interruptChan } -// SleepFinish implements amutex.Sleeper.SleepFinish. +// SleepFinish implements context.ChannelSleeper.SleepFinish. func (t *Task) SleepFinish(success bool) { if !success { - // The interrupted notification is consumed only at the top-level - // (Run). Therefore we attempt to reset the pending notification. - // This will also elide our next entry back into the task, so we - // will process signals, state changes, etc. + // Our caller received from t.interruptChan; we need to re-send to it + // to ensure that t.interrupted() is still true. t.interruptSelf() } t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible) t.Activate() } -// Interrupted implements amutex.Sleeper.Interrupted +// Interrupted implements context.ChannelSleeper.Interrupted. func (t *Task) Interrupted() bool { - return len(t.interruptChan) != 0 + if t.interrupted() { + return true + } + // Indicate that t's task goroutine is still responsive (i.e. reset the + // watchdog timer). + t.accountTaskGoroutineRunning() + return false } // UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart. func (t *Task) UninterruptibleSleepStart(deactivate bool) { + t.assertTaskGoroutine() if deactivate { t.Deactivate() } @@ -198,13 +214,17 @@ func (t *Task) UninterruptibleSleepFinish(activate bool) { } // interrupted returns true if interrupt or interruptSelf has been called at -// least once since the last call to interrupted. +// least once since the last call to unsetInterrupted. func (t *Task) interrupted() bool { + return len(t.interruptChan) != 0 +} + +// unsetInterrupted causes interrupted to return false until the next call to +// interrupt or interruptSelf. +func (t *Task) unsetInterrupted() { select { case <-t.interruptChan: - return true default: - return false } } @@ -220,9 +240,7 @@ func (t *Task) interrupt() { func (t *Task) interruptSelf() { select { case t.interruptChan <- struct{}{}: - t.Debugf("Interrupt queued") default: - t.Debugf("Dropping duplicate interrupt") } // platform.Context.Interrupt() is unnecessary since a task goroutine // calling interruptSelf() cannot also be blocked in diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 682080c14..f305e69c0 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -115,7 +115,7 @@ type CloneOptions struct { ParentTID usermem.Addr // If Vfork is true, place the parent in vforkStop until the cloned task - // releases its TaskContext. + // releases its TaskImage. Vfork bool // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for @@ -226,20 +226,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { }) } - tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) + image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err } cu.Add(func() { - tc.release() + image.release() }) // clone() returns 0 in the child. - tc.Arch.SetReturn(0) + image.Arch.SetReturn(0) if opts.Stack != 0 { - tc.Arch.SetStack(uintptr(opts.Stack)) + image.Arch.SetStack(uintptr(opts.Stack)) } if opts.SetTLS { - if !tc.Arch.SetTLS(uintptr(opts.TLS)) { + if !image.Arch.SetTLS(uintptr(opts.TLS)) { return 0, nil, syserror.EPERM } } @@ -288,7 +288,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { Kernel: t.k, ThreadGroup: tg, SignalMask: t.SignalMask(), - TaskContext: tc, + TaskImage: image, FSContext: fsContext, FDTable: fdTable, Credentials: creds, @@ -355,7 +355,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } if opts.ChildSetTID { ctid := nt.ThreadID() - ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) + ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) } ntid := t.tg.pidns.IDOfTask(nt) if opts.ParentSetTID { diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index d1136461a..70b0699dc 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,159 +15,175 @@ package kernel import ( - "fmt" + "time" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel/futex" - "gvisor.dev/gvisor/pkg/sentry/loader" - "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" ) -var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) - -// Auxmap contains miscellaneous data for the task. -type Auxmap map[string]interface{} - -// TaskContext is the subset of a task's data that is provided by the loader. -// -// +stateify savable -type TaskContext struct { - // Name is the thread name set by the prctl(PR_SET_NAME) system call. - Name string - - // Arch is the architecture-specific context (registers, etc.) - Arch arch.Context - - // MemoryManager is the task's address space. - MemoryManager *mm.MemoryManager +// Deadline implements context.Context.Deadline. +func (t *Task) Deadline() (time.Time, bool) { + return time.Time{}, false +} - // fu implements futexes in the address space. - fu *futex.Manager +// Done implements context.Context.Done. +func (t *Task) Done() <-chan struct{} { + return nil +} - // st is the task's syscall table. - st *SyscallTable `state:".(syscallTableInfo)"` +// Err implements context.Context.Err. +func (t *Task) Err() error { + return nil } -// release releases all resources held by the TaskContext. release is called by -// the task when it execs into a new TaskContext or exits. -func (tc *TaskContext) release() { - // Nil out pointers so that if the task is saved after release, it doesn't - // follow the pointers to possibly now-invalid objects. - if tc.MemoryManager != nil { - tc.MemoryManager.DecUsers(context.Background()) - tc.MemoryManager = nil +// Value implements context.Context.Value. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Value(key interface{}) interface{} { + // This function is very hot; skip this check outside of +race builds. + if sync.RaceEnabled { + t.assertTaskGoroutine() } - tc.fu = nil + return t.contextValue(key, true /* isTaskGoroutine */) } -// Fork returns a duplicate of tc. The copied TaskContext always has an -// independent arch.Context. If shareAddressSpace is true, the copied -// TaskContext shares an address space with the original; otherwise, the copied -// TaskContext has an independent address space that is initially a duplicate -// of the original's. -func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) { - newTC := &TaskContext{ - Name: tc.Name, - Arch: tc.Arch.Fork(), - st: tc.st, - } - if shareAddressSpace { - newTC.MemoryManager = tc.MemoryManager - if newTC.MemoryManager != nil { - if !newTC.MemoryManager.IncUsers() { - // Shouldn't be possible since tc.MemoryManager should be a - // counted user. - panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager")) - } +func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} { + switch key { + case CtxCanTrace: + return t.CanTrace + case CtxKernel: + return t.k + case CtxPIDNamespace: + return t.tg.pidns + case CtxUTSNamespace: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() + } + return t.utsns + case CtxIPCNamespace: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() + } + ipcns := t.ipcns + ipcns.IncRef() + return ipcns + case CtxTask: + return t + case auth.CtxCredentials: + return t.creds.Load() + case context.CtxThreadGroupID: + return int32(t.tg.ID()) + case fs.CtxRoot: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() + } + return t.fsContext.RootDirectory() + case vfs.CtxRoot: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() } - newTC.fu = tc.fu - } else { - newMM, err := tc.MemoryManager.Fork(ctx) - if err != nil { - return nil, err + return t.fsContext.RootDirectoryVFS2() + case vfs.CtxMountNamespace: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() } - newTC.MemoryManager = newMM - newTC.fu = k.futexes.Fork() + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 + case fs.CtxDirentCacheLimiter: + return t.k.DirentCacheLimiter + case inet.CtxStack: + return t.NetworkContext() + case ktime.CtxRealtimeClock: + return t.k.RealtimeClock() + case limits.CtxLimits: + return t.tg.limits + case pgalloc.CtxMemoryFile: + return t.k.mf + case pgalloc.CtxMemoryFileProvider: + return t.k + case platform.CtxPlatform: + return t.k + case uniqueid.CtxGlobalUniqueID: + return t.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return t.k + case uniqueid.CtxInotifyCookie: + return t.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return t.k + default: + return nil } - return newTC, nil } -// Arch returns t's arch.Context. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) Arch() arch.Context { - return t.tc.Arch +// taskAsyncContext implements context.Context for a goroutine that performs +// work on behalf of a Task, but is not the task goroutine. +type taskAsyncContext struct { + context.NoopSleeper + + t *Task } -// MemoryManager returns t's MemoryManager. MemoryManager does not take an -// additional reference on the returned MM. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) MemoryManager() *mm.MemoryManager { - return t.tc.MemoryManager +// AsyncContext returns a context.Context representing t. The returned +// context.Context is intended for use by goroutines other than t's task +// goroutine; for example, signal delivery to t will not interrupt goroutines +// that are blocking using the returned context.Context. +func (t *Task) AsyncContext() context.Context { + return taskAsyncContext{t: t} } -// SyscallTable returns t's syscall table. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) SyscallTable() *SyscallTable { - return t.tc.st +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) } -// Stack returns the userspace stack. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) Stack() *arch.Stack { - return &arch.Stack{ - Arch: t.Arch(), - IO: t.MemoryManager(), - Bottom: usermem.Addr(t.Arch().Stack()), - } +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) } -// LoadTaskImage loads a specified file into a new TaskContext. -// -// args.MemoryManager does not need to be set by the caller. -func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { - // If File is not nil, we should load that instead of resolving Filename. - if args.File != nil { - args.Filename = args.File.PathnameWithDeleted(ctx) - } +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} - // Prepare a new user address space to load into. - m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) - defer m.DecUsers(ctx) - args.MemoryManager = m +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return time.Time{}, false +} - os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) - if err != nil { - return nil, err - } +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return nil +} - // Lookup our new syscall table. - st, ok := LookupSyscallTable(os, ac.Arch()) - if !ok { - // No syscall table found. This means that the ELF binary does not match - // the architecture. - return nil, errNoSyscalls - } +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return nil +} - if !m.IncUsers() { - panic("Failed to increment users count on new MM") - } - return &TaskContext{ - Name: name, - Arch: ac, - MemoryManager: m, - fu: k.futexes.Fork(), - st: st, - }, nil +// Value implements context.Context.Value. +func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.contextValue(key, false /* isTaskGoroutine */) } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 412d471d3..d9897e802 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -83,11 +83,12 @@ type execStop struct{} func (*execStop) Killable() bool { return true } // Execve implements the execve(2) syscall by killing all other tasks in its -// thread group and switching to newTC. Execve always takes ownership of newTC. +// thread group and switching to newImage. Execve always takes ownership of +// newImage. // // Preconditions: The caller must be running Task.doSyscallInvoke on the task // goroutine. -func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { +func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.tg.signalHandlers.mu.Lock() @@ -96,7 +97,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { if t.tg.exiting || t.tg.execing != nil { // We lost to a racing group-exit, kill, or exec from another thread // and should just exit. - newTC.release() + newImage.release() return nil, syserror.EINTR } @@ -118,7 +119,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { t.beginInternalStopLocked((*execStop)(nil)) } - return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil + return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil } // The runSyscallAfterExecStop state continues execve(2) after all siblings of @@ -126,16 +127,16 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { // // +stateify savable type runSyscallAfterExecStop struct { - tc *TaskContext + image *TaskImage } func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { - t.traceExecEvent(r.tc) + t.traceExecEvent(r.image) t.tg.pidns.owner.mu.Lock() t.tg.execing = nil if t.killed() { t.tg.pidns.owner.mu.Unlock() - r.tc.release() + r.image.release() return (*runInterrupt)(nil) } // We are the thread group leader now. Save our old thread ID for @@ -214,7 +215,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // executables (set-user/group-ID bits and file capabilities). This // allows us to unconditionally enable user dumpability on the new mm. // See fs/exec.c:setup_new_exec. - r.tc.MemoryManager.SetDumpability(mm.UserDumpable) + r.image.MemoryManager.SetDumpability(mm.UserDumpable) // Switch to the new process. t.MemoryManager().Deactivate() @@ -222,8 +223,8 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // Update credentials to reflect the execve. This should precede switching // MMs to ensure that dumpability has been reset first, if needed. t.updateCredsForExecLocked() - t.tc.release() - t.tc = *r.tc + t.image.release() + t.image = *r.image t.mu.Unlock() t.unstopVforkParent() t.p.FullStateChanged() diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index ce7b9641d..16986244c 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -266,7 +266,7 @@ func (*runExitMain) execute(t *Task) taskRunState { t.updateRSSLocked() t.tg.pidns.owner.mu.Unlock() t.mu.Lock() - t.tc.release() + t.image.release() t.mu.Unlock() // Releasing the MM unblocks a blocked CLONE_VFORK parent. @@ -368,8 +368,8 @@ func (t *Task) exitChildren() { Signo: int32(sig), Code: arch.SignalInfoUser, } - siginfo.SetPid(int32(c.tg.pidns.tids[t])) - siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) + siginfo.SetPID(int32(c.tg.pidns.tids[t])) + siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) c.tg.signalHandlers.mu.Lock() c.sendSignalLocked(siginfo, true /* group */) c.tg.signalHandlers.mu.Unlock() @@ -698,8 +698,8 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.Si info := &arch.SignalInfo{ Signo: int32(sig), } - info.SetPid(int32(receiver.tg.pidns.tids[t])) - info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + info.SetPID(int32(receiver.tg.pidns.tids[t])) + info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) if t.exitStatus.Signaled() { info.Code = arch.CLD_KILLED info.SetStatus(int32(t.exitStatus.Signo)) diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index c80391475..195c7da9b 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -26,7 +26,7 @@ import ( // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Futex() *futex.Manager { - return t.tc.fu + return t.image.fu } // SwapUint32 implements futex.Target.SwapUint32. diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go new file mode 100644 index 000000000..ce5fbd299 --- /dev/null +++ b/pkg/sentry/kernel/task_image.go @@ -0,0 +1,173 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/usermem" +) + +var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) + +// Auxmap contains miscellaneous data for the task. +type Auxmap map[string]interface{} + +// TaskImage is the subset of a task's data that is provided by the loader. +// +// +stateify savable +type TaskImage struct { + // Name is the thread name set by the prctl(PR_SET_NAME) system call. + Name string + + // Arch is the architecture-specific context (registers, etc.) + Arch arch.Context + + // MemoryManager is the task's address space. + MemoryManager *mm.MemoryManager + + // fu implements futexes in the address space. + fu *futex.Manager + + // st is the task's syscall table. + st *SyscallTable `state:".(syscallTableInfo)"` +} + +// release releases all resources held by the TaskImage. release is called by +// the task when it execs into a new TaskImage or exits. +func (image *TaskImage) release() { + // Nil out pointers so that if the task is saved after release, it doesn't + // follow the pointers to possibly now-invalid objects. + if image.MemoryManager != nil { + image.MemoryManager.DecUsers(context.Background()) + image.MemoryManager = nil + } + image.fu = nil +} + +// Fork returns a duplicate of image. The copied TaskImage always has an +// independent arch.Context. If shareAddressSpace is true, the copied +// TaskImage shares an address space with the original; otherwise, the copied +// TaskImage has an independent address space that is initially a duplicate +// of the original's. +func (image *TaskImage) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskImage, error) { + newImage := &TaskImage{ + Name: image.Name, + Arch: image.Arch.Fork(), + st: image.st, + } + if shareAddressSpace { + newImage.MemoryManager = image.MemoryManager + if newImage.MemoryManager != nil { + if !newImage.MemoryManager.IncUsers() { + // Shouldn't be possible since image.MemoryManager should be a + // counted user. + panic(fmt.Sprintf("TaskImage.Fork called with userless TaskImage.MemoryManager")) + } + } + newImage.fu = image.fu + } else { + newMM, err := image.MemoryManager.Fork(ctx) + if err != nil { + return nil, err + } + newImage.MemoryManager = newMM + newImage.fu = k.futexes.Fork() + } + return newImage, nil +} + +// Arch returns t's arch.Context. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Arch() arch.Context { + return t.image.Arch +} + +// MemoryManager returns t's MemoryManager. MemoryManager does not take an +// additional reference on the returned MM. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) MemoryManager() *mm.MemoryManager { + return t.image.MemoryManager +} + +// SyscallTable returns t's syscall table. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) SyscallTable() *SyscallTable { + return t.image.st +} + +// Stack returns the userspace stack. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Stack() *arch.Stack { + return &arch.Stack{ + Arch: t.Arch(), + IO: t.MemoryManager(), + Bottom: usermem.Addr(t.Arch().Stack()), + } +} + +// LoadTaskImage loads a specified file into a new TaskImage. +// +// args.MemoryManager does not need to be set by the caller. +func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskImage, *syserr.Error) { + // If File is not nil, we should load that instead of resolving Filename. + if args.File != nil { + args.Filename = args.File.PathnameWithDeleted(ctx) + } + + // Prepare a new user address space to load into. + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) + defer m.DecUsers(ctx) + args.MemoryManager = m + + os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) + if err != nil { + return nil, err + } + + // Lookup our new syscall table. + st, ok := LookupSyscallTable(os, ac.Arch()) + if !ok { + // No syscall table found. This means that the ELF binary does not match + // the architecture. + return nil, errNoSyscalls + } + + if !m.IncUsers() { + panic("Failed to increment users count on new MM") + } + return &TaskImage{ + Name: name, + Arch: ac, + MemoryManager: m, + fu: k.futexes.Fork(), + st: st, + }, nil +} diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index d23cea802..c70e5e6ce 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -19,6 +19,7 @@ import ( "runtime/trace" "sort" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/usermem" ) @@ -215,7 +216,7 @@ func (t *Task) rebuildTraceContext(tid ThreadID) { // arbitrarily large (in general it won't be, especially for cases // where we're collecting a brief profile), so using the TID is a // reasonable compromise in this case. - t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid)) + t.traceContext, t.traceTask = trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", tid)) } // traceCloneEvent is called when a new task is spawned. @@ -237,11 +238,11 @@ func (t *Task) traceExitEvent() { } // traceExecEvent is called when a task calls exec. -func (t *Task) traceExecEvent(tc *TaskContext) { +func (t *Task) traceExecEvent(image *TaskImage) { if !trace.IsEnabled() { return } - file := tc.MemoryManager.Executable() + file := image.MemoryManager.Executable() if file == nil { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 8dc3fec90..3ccecf4b6 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -16,11 +16,13 @@ package kernel import ( "bytes" + "fmt" "runtime" "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/goid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -57,6 +59,8 @@ type taskRunState interface { // make it visible in stack dumps. A goroutine for a given task can be identified // searching for Task.run()'s argument value. func (t *Task) run(threadID uintptr) { + atomic.StoreInt64(&t.goid, goid.Get()) + // Construct t.blockingTimer here. We do this here because we can't // reconstruct t.blockingTimer during restore in Task.afterLoad(), because // kernel.timekeeper.SetClocks() hasn't been called yet. @@ -99,6 +103,9 @@ func (t *Task) run(threadID uintptr) { t.tg.pidns.owner.runningGoroutines.Done() t.p.Release() + // Deferring this store triggers a false positive in the race + // detector (https://github.com/golang/go/issues/42599). + atomic.StoreInt64(&t.goid, 0) // Keep argument alive because stack trace for dead variables may not be correct. runtime.KeepAlive(threadID) return @@ -317,7 +324,7 @@ func (app *runApp) execute(t *Task) taskRunState { // region. We should be able to easily identify // vsyscalls by having a <fault><syscall> pair. if at.Execute { - if sysno, ok := t.tc.st.LookupEmulate(addr); ok { + if sysno, ok := t.image.st.LookupEmulate(addr); ok { return t.doVsyscall(addr, sysno) } } @@ -375,6 +382,19 @@ func (app *runApp) execute(t *Task) taskRunState { } } +// assertTaskGoroutine panics if the caller is not running on t's task +// goroutine. +func (t *Task) assertTaskGoroutine() { + if got, want := goid.Get(), atomic.LoadInt64(&t.goid); got != want { + panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want)) + } +} + +// GoroutineID returns the ID of t's task goroutine. +func (t *Task) GoroutineID() int64 { + return atomic.LoadInt64(&t.goid) +} + // waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. func (t *Task) waitGoroutineStoppedOrExited() { t.goroutineStopped.Wait() diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 52c55d13d..9ba5f8d78 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -157,6 +157,18 @@ func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { t.goschedSeq.EndWrite() } +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) accountTaskGoroutineRunning() { + now := t.k.CPUClockNow() + if t.gosched.State != TaskGoroutineRunningSys { + panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys)) + } + t.goschedSeq.BeginWrite() + t.gosched.SysTicks += now - t.gosched.Timestamp + t.gosched.Timestamp = now + t.goschedSeq.EndWrite() +} + // TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. // Most clients should use t.CPUStats() instead. func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index ebdb83061..75af3af79 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -619,9 +619,6 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { return } }) - // We have to re-issue the interrupt consumed by t.interrupted() since - // it might have been for a different reason. - t.interruptSelf() } // Conversely, if the new mask unblocks any signals that were blocked by @@ -917,8 +914,8 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { Signo: int32(linux.SIGCHLD), Code: code, } - sigchld.SetPid(int32(t.tg.pidns.tids[target])) - sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + sigchld.SetPID(int32(t.tg.pidns.tids[target])) + sigchld.SetUID(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) sigchld.SetStatus(status) // TODO(b/72102453): Set utime, stime. t.sendSignalLocked(sigchld, true /* group */) @@ -931,10 +928,10 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { type runInterrupt struct{} func (*runInterrupt) execute(t *Task) taskRunState { - // Interrupts are de-duplicated (if t is interrupted twice before - // t.interrupted() is called, t.interrupted() will only return true once), - // so early exits from this function must re-enter the runInterrupt state - // to check for more interrupt-signaled conditions. + // Interrupts are de-duplicated (t.unsetInterrupted() will undo the effect + // of all previous calls to t.interrupted() regardless of how many such + // calls there have been), so early exits from this function must re-enter + // the runInterrupt state to check for more interrupt-signaled conditions. t.tg.signalHandlers.mu.Lock() @@ -1025,8 +1022,8 @@ func (*runInterrupt) execute(t *Task) taskRunState { Signo: int32(sig), Code: t.ptraceCode, } - t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) - t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } else { t.ptraceCode = int32(sig) t.ptraceSiginfo = nil @@ -1080,6 +1077,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { return t.deliverSignal(info, act) } + t.unsetInterrupted() t.tg.signalHandlers.mu.Unlock() return (*runApp)(nil) } @@ -1116,11 +1114,11 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { if parent == nil { // Tracer has detached and t was created by Kernel.CreateProcess(). // Pretend the parent is in an ancestor PID + user namespace. - info.SetPid(0) - info.SetUid(int32(auth.OverflowUID)) + info.SetPID(0) + info.SetUID(int32(auth.OverflowUID)) } else { - info.SetPid(int32(t.tg.pidns.tids[parent])) - info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + info.SetPID(int32(t.tg.pidns.tids[parent])) + info.SetUID(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } } t.tg.signalHandlers.mu.Lock() diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 8e28230cc..36e1384f1 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -46,10 +46,10 @@ type TaskConfig struct { // SignalMask is the new task's initial signal mask. SignalMask linux.SignalSet - // TaskContext is the TaskContext of the new task. Ownership of the - // TaskContext is transferred to TaskSet.NewTask, whether or not it + // TaskImage is the TaskImage of the new task. Ownership of the + // TaskImage is transferred to TaskSet.NewTask, whether or not it // succeeds. - TaskContext *TaskContext + TaskImage *TaskImage // FSContext is the FSContext of the new task. A reference must be held on // FSContext, which is transferred to TaskSet.NewTask whether or not it @@ -105,7 +105,7 @@ type TaskConfig struct { func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { - cfg.TaskContext.release() + cfg.TaskImage.release() cfg.FSContext.DecRef(ctx) cfg.FDTable.DecRef(ctx) cfg.IPCNamespace.DecRef(ctx) @@ -121,7 +121,7 @@ func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) // of cfg if it succeeds. func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { tg := cfg.ThreadGroup - tc := cfg.TaskContext + image := cfg.TaskImage t := &Task{ taskNode: taskNode{ tg: tg, @@ -132,7 +132,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { interruptChan: make(chan struct{}, 1), signalMask: cfg.SignalMask, signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, - tc: *tc, + image: *image, fsContext: cfg.FSContext, fdTable: cfg.FDTable, p: cfg.Kernel.Platform.NewContext(), diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index ce134bf54..94dabbcd8 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -18,7 +18,8 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -281,29 +282,89 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp }, nil } -// copyContext implements marshal.CopyContext. It wraps a task to allow copying -// memory to and from the task memory with custom usermem.IOOpts. -type copyContext struct { - *Task +type taskCopyContext struct { + ctx context.Context + t *Task opts usermem.IOOpts } -// AsCopyContext wraps the task and returns it as CopyContext. -func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext { - return ©Context{t, opts} +// CopyContext returns a marshal.CopyContext that copies to/from t's address +// space using opts. +func (t *Task) CopyContext(ctx context.Context, opts usermem.IOOpts) *taskCopyContext { + return &taskCopyContext{ + ctx: ctx, + t: t, + opts: opts, + } +} + +// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. +func (cc *taskCopyContext) CopyScratchBuffer(size int) []byte { + if ctxTask, ok := cc.ctx.(*Task); ok { + return ctxTask.CopyScratchBuffer(size) + } + return make([]byte, size) +} + +func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) { + cc.t.mu.Lock() + tmm := cc.t.MemoryManager() + cc.t.mu.Unlock() + if !tmm.IncUsers() { + return nil, syserror.EFAULT + } + return tmm, nil +} + +// CopyInBytes implements marshal.CopyContext.CopyInBytes. +func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { + tmm, err := cc.getMemoryManager() + if err != nil { + return 0, err + } + defer tmm.DecUsers(cc.ctx) + return tmm.CopyIn(cc.ctx, addr, dst, cc.opts) +} + +// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. +func (cc *taskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + tmm, err := cc.getMemoryManager() + if err != nil { + return 0, err + } + defer tmm.DecUsers(cc.ctx) + return tmm.CopyOut(cc.ctx, addr, src, cc.opts) +} + +type ownTaskCopyContext struct { + t *Task + opts usermem.IOOpts +} + +// OwnCopyContext returns a marshal.CopyContext that copies to/from t's address +// space using opts. The returned CopyContext may only be used by t's task +// goroutine. +// +// Since t already implements marshal.CopyContext, this is only needed to +// override the usermem.IOOpts used for the copy. +func (t *Task) OwnCopyContext(opts usermem.IOOpts) *ownTaskCopyContext { + return &ownTaskCopyContext{ + t: t, + opts: opts, + } } -// CopyInString copies a string in from the task's memory. -func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) { - return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts) +// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. +func (cc *ownTaskCopyContext) CopyScratchBuffer(size int) []byte { + return cc.t.CopyScratchBuffer(size) } -// CopyInBytes copies task memory into dst from an IO context. -func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { - return t.MemoryManager().CopyIn(t, addr, dst, t.opts) +// CopyInBytes implements marshal.CopyContext.CopyInBytes. +func (cc *ownTaskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { + return cc.t.MemoryManager().CopyIn(cc.t, addr, dst, cc.opts) } -// CopyOutBytes copies src into task memoryfrom an IO context. -func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { - return t.MemoryManager().CopyOut(t, addr, src, t.opts) +// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. +func (cc *ownTaskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + return cc.t.MemoryManager().CopyOut(cc.t, addr, src, cc.opts) } diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 9bc452e67..9e5c2d26f 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -115,7 +115,7 @@ func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error { } if old != v.seq { - return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq) + return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d; application may hang or get incorrect time from the VDSO", old, v.seq) } v.seq = next |