Diffstat (limited to 'pkg/sentry/kernel')
pkg/sentry/kernel/BUILD                        |  14
pkg/sentry/kernel/abstract_socket_namespace.go |  10
pkg/sentry/kernel/aio.go                       |  50
pkg/sentry/kernel/context.go                   |  17
pkg/sentry/kernel/epoll/epoll.go               |   7
pkg/sentry/kernel/fasync/BUILD                 |   2
pkg/sentry/kernel/fasync/fasync.go             |  96
pkg/sentry/kernel/fd_table.go                  |   6
pkg/sentry/kernel/fd_table_unsafe.go           |  24
pkg/sentry/kernel/fs_context.go                |  28
pkg/sentry/kernel/ipc_namespace.go             |   4
pkg/sentry/kernel/kernel.go                    | 239
pkg/sentry/kernel/pipe/node_test.go            |   8
pkg/sentry/kernel/pipe/pipe.go                 |  50
pkg/sentry/kernel/pipe/pipe_test.go            |  11
pkg/sentry/kernel/pipe/vfs.go                  |   8
pkg/sentry/kernel/ptrace.go                    |   8
pkg/sentry/kernel/seccomp.go                   |   2
pkg/sentry/kernel/semaphore/semaphore.go       | 164
pkg/sentry/kernel/sessions.go                  |  38
pkg/sentry/kernel/shm/BUILD                    |   7
pkg/sentry/kernel/shm/shm.go                   |   2
pkg/sentry/kernel/signal.go                    |   4
pkg/sentry/kernel/signalfd/signalfd.go         |   4
pkg/sentry/kernel/syscalls_state.go            |  10
pkg/sentry/kernel/syslog.go                    |   6
pkg/sentry/kernel/task.go                      |  85
pkg/sentry/kernel/task_acct.go                 |   4
pkg/sentry/kernel/task_block.go                |  44
pkg/sentry/kernel/task_clone.go                |  16
pkg/sentry/kernel/task_context.go              | 272
pkg/sentry/kernel/task_exec.go                 |  21
pkg/sentry/kernel/task_exit.go                 |  10
pkg/sentry/kernel/task_futex.go                |   2
pkg/sentry/kernel/task_image.go                | 173
pkg/sentry/kernel/task_log.go                  |   7
pkg/sentry/kernel/task_run.go                  |  22
pkg/sentry/kernel/task_sched.go                |  12
pkg/sentry/kernel/task_signals.go              |  28
pkg/sentry/kernel/task_start.go                |  12
pkg/sentry/kernel/task_usermem.go              |  95
pkg/sentry/kernel/vdso.go                      |   2
42 files changed, 1046 insertions(+), 578 deletions(-)
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index c0de72eef..0ee60569c 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -79,7 +79,7 @@ go_template_instance(
out = "fd_table_refs.go",
package = "kernel",
prefix = "FDTable",
- template = "//pkg/refs_vfs2:refs_template",
+ template = "//pkg/refsvfs2:refs_template",
types = {
"T": "FDTable",
},
@@ -90,7 +90,7 @@ go_template_instance(
out = "fs_context_refs.go",
package = "kernel",
prefix = "FSContext",
- template = "//pkg/refs_vfs2:refs_template",
+ template = "//pkg/refsvfs2:refs_template",
types = {
"T": "FSContext",
},
@@ -101,7 +101,7 @@ go_template_instance(
out = "ipc_namespace_refs.go",
package = "kernel",
prefix = "IPCNamespace",
- template = "//pkg/refs_vfs2:refs_template",
+ template = "//pkg/refsvfs2:refs_template",
types = {
"T": "IPCNamespace",
},
@@ -112,7 +112,7 @@ go_template_instance(
out = "process_group_refs.go",
package = "kernel",
prefix = "ProcessGroup",
- template = "//pkg/refs_vfs2:refs_template",
+ template = "//pkg/refsvfs2:refs_template",
types = {
"T": "ProcessGroup",
},
@@ -123,7 +123,7 @@ go_template_instance(
out = "session_refs.go",
package = "kernel",
prefix = "Session",
- template = "//pkg/refs_vfs2:refs_template",
+ template = "//pkg/refsvfs2:refs_template",
types = {
"T": "Session",
},
@@ -184,6 +184,7 @@ go_library(
"task_exit.go",
"task_futex.go",
"task_identity.go",
+ "task_image.go",
"task_list.go",
"task_log.go",
"task_net.go",
@@ -224,12 +225,13 @@ go_library(
"//pkg/cpuid",
"//pkg/eventchannel",
"//pkg/fspath",
+ "//pkg/goid",
"//pkg/log",
"//pkg/marshal",
"//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/refs",
- "//pkg/refs_vfs2",
+ "//pkg/refsvfs2",
"//pkg/safemem",
"//pkg/secio",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index 1b9721534..0ddbe5ff6 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -19,7 +19,7 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs_vfs2"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -27,7 +27,7 @@ import (
// +stateify savable
type abstractEndpoint struct {
ep transport.BoundEndpoint
- socket refs_vfs2.RefCounter
+ socket refsvfs2.RefCounter
name string
ns *AbstractSocketNamespace
}
@@ -57,7 +57,7 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace {
// its backing socket.
type boundEndpoint struct {
transport.BoundEndpoint
- socket refs_vfs2.RefCounter
+ socket refsvfs2.RefCounter
}
// Release implements transport.BoundEndpoint.Release.
@@ -89,7 +89,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp
//
// When the last reference managed by socket is dropped, ep may be removed from the
// namespace.
-func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refs_vfs2.RefCounter) error {
+func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error {
a.mu.Lock()
defer a.mu.Unlock()
@@ -109,7 +109,7 @@ func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep tran
// Remove removes the specified socket at name from the abstract socket
// namespace, if it has not yet been replaced.
-func (a *AbstractSocketNamespace) Remove(name string, socket refs_vfs2.RefCounter) {
+func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) {
a.mu.Lock()
defer a.mu.Unlock()
diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go
index 0ac78c0b8..ec36d1a49 100644
--- a/pkg/sentry/kernel/aio.go
+++ b/pkg/sentry/kernel/aio.go
@@ -15,10 +15,7 @@
package kernel
import (
- "time"
-
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/log"
)
// AIOCallback is a function that does asynchronous I/O on behalf of a task.
@@ -26,7 +23,7 @@ type AIOCallback func(context.Context)
// QueueAIO queues an AIOCallback which will be run asynchronously.
func (t *Task) QueueAIO(cb AIOCallback) {
- ctx := taskAsyncContext{t: t}
+ ctx := t.AsyncContext()
wg := &t.TaskSet().aioGoroutines
wg.Add(1)
go func() {
@@ -34,48 +31,3 @@ func (t *Task) QueueAIO(cb AIOCallback) {
wg.Done()
}()
}
-
-type taskAsyncContext struct {
- context.NoopSleeper
- t *Task
-}
-
-// Debugf implements log.Logger.Debugf.
-func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
- ctx.t.Debugf(format, v...)
-}
-
-// Infof implements log.Logger.Infof.
-func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
- ctx.t.Infof(format, v...)
-}
-
-// Warningf implements log.Logger.Warningf.
-func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
- ctx.t.Warningf(format, v...)
-}
-
-// IsLogging implements log.Logger.IsLogging.
-func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
- return ctx.t.IsLogging(level)
-}
-
-// Deadline implements context.Context.Deadline.
-func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
- return ctx.t.Deadline()
-}
-
-// Done implements context.Context.Done.
-func (ctx taskAsyncContext) Done() <-chan struct{} {
- return ctx.t.Done()
-}
-
-// Err implements context.Context.Err.
-func (ctx taskAsyncContext) Err() error {
- return ctx.t.Err()
-}
-
-// Value implements context.Context.Value.
-func (ctx taskAsyncContext) Value(key interface{}) interface{} {
- return ctx.t.Value(key)
-}
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index bb94769c4..a8596410f 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -15,8 +15,6 @@
package kernel
import (
- "time"
-
"gvisor.dev/gvisor/pkg/context"
)
@@ -98,18 +96,3 @@ func TaskFromContext(ctx context.Context) *Task {
}
return nil
}
-
-// Deadline implements context.Context.Deadline.
-func (*Task) Deadline() (time.Time, bool) {
- return time.Time{}, false
-}
-
-// Done implements context.Context.Done.
-func (*Task) Done() <-chan struct{} {
- return nil
-}
-
-// Err implements context.Context.Err.
-func (*Task) Err() error {
- return nil
-}
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 15519f0df..61aeca044 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -273,7 +273,7 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
//
// Callback is called when one of the files we're polling becomes ready. It
// moves said file to the readyList if it's currently in the waiting list.
-func (p *pollEntry) Callback(*waiter.Entry) {
+func (p *pollEntry) Callback(*waiter.Entry, waiter.EventMask) {
e := p.epoll
e.listsMu.Lock()
@@ -306,9 +306,8 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
f.EventRegister(&entry.waiter, entry.mask)
// Check if the file happens to already be in a ready state.
- ready := f.Readiness(entry.mask) & entry.mask
- if ready != 0 {
- entry.Callback(&entry.waiter)
+ if ready := f.Readiness(entry.mask) & entry.mask; ready != 0 {
+ entry.Callback(&entry.waiter, ready)
}
}
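
The epoll hunk above widens the waiter callback signature so the mask of events that actually fired travels with the wakeup, sparing the callback a second poll. A minimal, runnable sketch of that pattern, using local stand-ins (EventMask, Entry) rather than the real pkg/waiter types:

    package main

    import "fmt"

    // EventMask plays the role of waiter.EventMask; the bit values are
    // illustrative stand-ins, not the real gVisor constants.
    type EventMask uint64

    const (
        EventIn  EventMask = 0x1
        EventOut EventMask = 0x4
    )

    // Entry plays the role of waiter.Entry: a queued callback that, after
    // the change, receives the mask of events that fired.
    type Entry struct {
        callback func(e *Entry, ready EventMask)
    }

    // notify delivers the readiness mask along with the wakeup, so the
    // callback no longer has to re-check Readiness() to learn what is ready.
    func notify(e *Entry, ready EventMask) {
        e.callback(e, ready)
    }

    func main() {
        e := &Entry{callback: func(_ *Entry, ready EventMask) {
            fmt.Printf("woken with ready mask %#x\n", ready)
        }}
        notify(e, EventIn|EventOut) // prints: woken with ready mask 0x5
    }
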
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index 2b3955598..f855f038b 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -8,11 +8,13 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
+ "//pkg/sentry/arch",
"//pkg/sentry/fs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
"//pkg/sync",
+ "//pkg/syserror",
"//pkg/waiter",
],
)
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
index 153d2cd9b..b66d61c6f 100644
--- a/pkg/sentry/kernel/fasync/fasync.go
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -17,22 +17,45 @@ package fasync
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
-// New creates a new fs.FileAsync.
-func New() fs.FileAsync {
- return &FileAsync{}
+// Table to convert waiter event masks into si_band siginfo codes.
+// Taken from fs/fcntl.c:band_table.
+var bandTable = map[waiter.EventMask]int64{
+ // POLL_IN
+ waiter.EventIn: linux.EPOLLIN | linux.EPOLLRDNORM,
+ // POLL_OUT
+ waiter.EventOut: linux.EPOLLOUT | linux.EPOLLWRNORM | linux.EPOLLWRBAND,
+ // POLL_ERR
+ waiter.EventErr: linux.EPOLLERR,
+ // POLL_PRI
+ waiter.EventPri: linux.EPOLLPRI | linux.EPOLLRDBAND,
+ // POLL_HUP
+ waiter.EventHUp: linux.EPOLLHUP | linux.EPOLLERR,
}
-// NewVFS2 creates a new vfs.FileAsync.
-func NewVFS2() vfs.FileAsync {
- return &FileAsync{}
+// New returns a function that creates a new fs.FileAsync with the given file
+// descriptor.
+func New(fd int) func() fs.FileAsync {
+ return func() fs.FileAsync {
+ return &FileAsync{fd: fd}
+ }
+}
+
+// NewVFS2 returns a function that creates a new vfs.FileAsync with the given
+// file descriptor.
+func NewVFS2(fd int) func() vfs.FileAsync {
+ return func() vfs.FileAsync {
+ return &FileAsync{fd: fd}
+ }
}
// FileAsync sends signals when the registered file is ready for IO.
@@ -42,6 +65,12 @@ type FileAsync struct {
// e is immutable after first use (which is protected by mu below).
e waiter.Entry
+ // fd is the file descriptor to notify about.
+ // It is immutable, set at allocation time. This matches Linux semantics in
+ // fs/fcntl.c:fasync_helper.
+ // The fd value is passed to the signal recipient in siginfo.si_fd.
+ fd int
+
// regMu protects registration and unregistration actions on e.
//
// regMu must be held while registration decisions are being made
@@ -56,6 +85,10 @@ type FileAsync struct {
mu sync.Mutex `state:"nosave"`
requester *auth.Credentials
registered bool
+ // signal is the signal to deliver upon I/O being available.
+ // The default value ("zero signal") means the default SIGIO signal will be
+ // delivered.
+ signal linux.Signal
// Only one of the following is allowed to be non-nil.
recipientPG *kernel.ProcessGroup
@@ -64,10 +97,10 @@ type FileAsync struct {
}
// Callback sends a signal.
-func (a *FileAsync) Callback(e *waiter.Entry) {
+func (a *FileAsync) Callback(e *waiter.Entry, mask waiter.EventMask) {
a.mu.Lock()
+ defer a.mu.Unlock()
if !a.registered {
- a.mu.Unlock()
return
}
t := a.recipientT
@@ -80,19 +113,34 @@ func (a *FileAsync) Callback(e *waiter.Entry) {
}
if t == nil {
// No recipient has been registered.
- a.mu.Unlock()
return
}
c := t.Credentials()
// Logic from sigio_perm in fs/fcntl.c.
- if a.requester.EffectiveKUID == 0 ||
+ permCheck := (a.requester.EffectiveKUID == 0 ||
a.requester.EffectiveKUID == c.SavedKUID ||
a.requester.EffectiveKUID == c.RealKUID ||
a.requester.RealKUID == c.SavedKUID ||
- a.requester.RealKUID == c.RealKUID {
- t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO))
+ a.requester.RealKUID == c.RealKUID)
+ if !permCheck {
+ return
}
- a.mu.Unlock()
+ signalInfo := &arch.SignalInfo{
+ Signo: int32(linux.SIGIO),
+ Code: arch.SignalInfoKernel,
+ }
+ if a.signal != 0 {
+ signalInfo.Signo = int32(a.signal)
+ signalInfo.SetFD(uint32(a.fd))
+ var band int64
+ for m, bandCode := range bandTable {
+ if m&mask != 0 {
+ band |= bandCode
+ }
+ }
+ signalInfo.SetBand(band)
+ }
+ t.SendSignal(signalInfo)
}
// Register sets the file which will be monitored for IO events.
@@ -186,3 +234,25 @@ func (a *FileAsync) ClearOwner() {
a.recipientTG = nil
a.recipientPG = nil
}
+
+// Signal returns which signal will be sent to the signal recipient.
+// A value of zero means the signal to deliver was not customized, in which
+// case the default signal (SIGIO) will be delivered.
+func (a *FileAsync) Signal() linux.Signal {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ return a.signal
+}
+
+// SetSignal overrides which signal to send when I/O is available.
+// The default behavior can be reset by specifying signal zero, which means
+// to send SIGIO.
+func (a *FileAsync) SetSignal(signal linux.Signal) error {
+ if signal != 0 && !signal.IsValid() {
+ return syserror.EINVAL
+ }
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ a.signal = signal
+ return nil
+}
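
Callback above folds every fired event bit into siginfo.si_band via bandTable. A self-contained sketch of that fold; the EPOLL* values are copied from common Linux headers but should be treated as illustrative here:

    package main

    import "fmt"

    type EventMask uint64

    const (
        EventIn  EventMask = 0x1
        EventOut EventMask = 0x4

        epollIn     int64 = 0x001
        epollOut    int64 = 0x004
        epollRdNorm int64 = 0x040
        epollWrNorm int64 = 0x100
    )

    // bandTable mirrors fs/fcntl.c:band_table: each waiter event maps to
    // the si_band codes it contributes.
    var bandTable = map[EventMask]int64{
        EventIn:  epollIn | epollRdNorm,  // POLL_IN
        EventOut: epollOut | epollWrNorm, // POLL_OUT
    }

    // band ORs in the codes for every event bit present in mask, as the
    // loop in Callback does before signalInfo.SetBand(band).
    func band(mask EventMask) int64 {
        var b int64
        for m, code := range bandTable {
            if m&mask != 0 {
                b |= code
            }
        }
        return b
    }

    func main() {
        fmt.Printf("si_band for In|Out: %#x\n", band(EventIn|EventOut))
    }
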
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 0ec7344cd..7aba31587 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -110,7 +110,7 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
ctx := context.Background()
- f.init() // Initialize table.
+ f.initNoLeakCheck() // Initialize table.
f.used = 0
for fd, d := range m {
if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil {
@@ -240,6 +240,10 @@ func (f *FDTable) String() string {
case fileVFS2 != nil:
vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
+ vd := fileVFS2.VirtualDentry()
+ if vd.Dentry() == nil {
+ panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, fileVFS2.Impl(), fileVFS2))
+ }
name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
if err != nil {
fmt.Fprintf(&buf, "<err: %v>\n", err)
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index da79e6627..f17f9c59c 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -31,14 +31,21 @@ type descriptorTable struct {
slice unsafe.Pointer `state:".(map[int32]*descriptor)"`
}
-// init initializes the table.
+// initNoLeakCheck initializes the table without enabling leak checking.
//
-// TODO(gvisor.dev/1486): Enable leak check for FDTable.
-func (f *FDTable) init() {
+// This is used when loading an FDTable after S/R, during which the ref count
+// object itself will enable leak checking if necessary.
+func (f *FDTable) initNoLeakCheck() {
var slice []unsafe.Pointer // Empty slice.
atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
}
+// init initializes the table with leak checking.
+func (f *FDTable) init() {
+ f.initNoLeakCheck()
+ f.InitRefs()
+}
+
// get gets a file entry.
//
// The boolean indicates whether this was in range.
@@ -114,18 +121,21 @@ func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2
panic("VFS1 and VFS2 files set")
}
- slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+ slicePtr := (*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
// Grow the table as required.
- if last := int32(len(slice)); fd >= last {
+ if last := int32(len(*slicePtr)); fd >= last {
end := fd + 1
if end < 2*last {
end = 2 * last
}
- slice = append(slice, make([]unsafe.Pointer, end-last)...)
- atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
+ newSlice := append(*slicePtr, make([]unsafe.Pointer, end-last)...)
+ slicePtr = &newSlice
+ atomic.StorePointer(&f.slice, unsafe.Pointer(slicePtr))
}
+ slice := *slicePtr
+
var desc *descriptor
if file != nil || fileVFS2 != nil {
desc = &descriptor{
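
The setAll change stops mutating the slice header that lock-free readers load through f.slice; growth now builds a fresh slice and atomically publishes a pointer to it. A runnable distillation of that copy-on-grow pattern (single writer assumed, as under the FDTable's external synchronization; types are stand-ins):

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // table stands in for descriptorTable: readers Load the published
    // slice header without locks, so it must never be mutated in place.
    type table struct {
        slice atomic.Pointer[[]int]
    }

    // grow publishes a new, larger slice. Readers holding the old pointer
    // keep seeing a consistent (ptr, len, cap) header.
    func (t *table) grow(to int) {
        cur := *t.slice.Load()
        if to <= len(cur) {
            return
        }
        next := append(append([]int(nil), cur...), make([]int, to-len(cur))...)
        t.slice.Store(&next)
    }

    func main() {
        var t table
        empty := []int{}
        t.slice.Store(&empty)
        t.grow(8)
        fmt.Println("len after grow:", len(*t.slice.Load())) // 8
    }
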
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index d46d1e1c1..dfde4deee 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -63,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
cwd: cwd,
umask: umask,
}
- f.EnableLeakCheck()
+ f.InitRefs()
return &f
}
@@ -76,7 +76,7 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
cwdVFS2: cwd,
umask: umask,
}
- f.EnableLeakCheck()
+ f.InitRefs()
return &f
}
@@ -130,13 +130,15 @@ func (f *FSContext) Fork() *FSContext {
f.root.IncRef()
}
- return &FSContext{
+ ctx := &FSContext{
cwd: f.cwd,
root: f.root,
cwdVFS2: f.cwdVFS2,
rootVFS2: f.rootVFS2,
umask: f.umask,
}
+ ctx.InitRefs()
+ return ctx
}
// WorkingDirectory returns the current working directory.
@@ -147,19 +149,23 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent {
f.mu.Lock()
defer f.mu.Unlock()
- f.cwd.IncRef()
+ if f.cwd != nil {
+ f.cwd.IncRef()
+ }
return f.cwd
}
// WorkingDirectoryVFS2 returns the current working directory.
//
-// This will return nil if called after f is destroyed, otherwise it will return
-// a Dirent with a reference taken.
+// This will return an empty vfs.VirtualDentry if called after f is
+// destroyed, otherwise it will return a VirtualDentry with a reference taken.
func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
f.mu.Lock()
defer f.mu.Unlock()
- f.cwdVFS2.IncRef()
+ if f.cwdVFS2.Ok() {
+ f.cwdVFS2.IncRef()
+ }
return f.cwdVFS2
}
@@ -218,13 +224,15 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
// RootDirectoryVFS2 returns the current filesystem root.
//
-// This will return nil if called after f is destroyed, otherwise it will return
-// a Dirent with a reference taken.
+// This will return an empty vfs.VirtualDentry if called after f is
+// destroyed, otherwise it will return a VirtualDentry with a reference taken.
func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
f.mu.Lock()
defer f.mu.Unlock()
- f.rootVFS2.IncRef()
+ if f.rootVFS2.Ok() {
+ f.rootVFS2.IncRef()
+ }
return f.rootVFS2
}
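
The fs_context.go hunks guard every IncRef behind a validity check, since a destroyed FSContext has its directories cleared. A tiny sketch of that accessor shape, with a stand-in refcounted type and locking elided:

    package main

    import "fmt"

    // dirent stands in for a reference-counted fs.Dirent.
    type dirent struct{ refs int }

    func (d *dirent) IncRef() { d.refs++ }

    type fsContext struct {
        cwd *dirent // nil once the context has been destroyed
    }

    // WorkingDirectory takes a reference only when cwd is still valid,
    // mirroring the nil/Ok() guards added in the diff.
    func (f *fsContext) WorkingDirectory() *dirent {
        if f.cwd != nil {
            f.cwd.IncRef()
        }
        return f.cwd
    }

    func main() {
        live := &fsContext{cwd: &dirent{}}
        fmt.Println(live.WorkingDirectory() != nil)      // true, ref taken
        destroyed := &fsContext{}
        fmt.Println(destroyed.WorkingDirectory() == nil) // true, no IncRef
    }
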
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index 3f34ee0db..9545bb5ef 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -41,7 +41,7 @@ func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
semaphores: semaphore.NewRegistry(userNS),
shms: shm.NewRegistry(userNS),
}
- ns.EnableLeakCheck()
+ ns.InitRefs()
return ns
}
@@ -55,7 +55,7 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry {
return i.shms
}
-// DecRef implements refs_vfs2.RefCounter.DecRef.
+// DecRef implements refsvfs2.RefCounter.DecRef.
func (i *IPCNamespace) DecRef(ctx context.Context) {
i.IPCNamespaceRefs.DecRef(func() {
i.shms.Release(ctx)
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 0eb2bf7bd..b8627a54f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -214,9 +214,11 @@ type Kernel struct {
// netlinkPorts manages allocation of netlink socket port IDs.
netlinkPorts *port.Manager
- // saveErr is the error causing the sandbox to exit during save, if
- // any. It is protected by extMu.
- saveErr error `state:"nosave"`
+ // saveStatus is nil if the sandbox has not been saved, errSaved or
+ // errAutoSaved if it has been saved successfully, or the error causing the
+ // sandbox to exit during save.
+ // It is protected by extMu.
+ saveStatus error `state:"nosave"`
// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
@@ -430,9 +432,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
-func (k *Kernel) SaveTo(w wire.Writer) error {
+func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
saveStart := time.Now()
- ctx := k.SupervisorContext()
// Do not allow other Kernel methods to affect it while it's being saved.
k.extMu.Lock()
@@ -446,38 +447,55 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
k.mf.StartEvictions()
k.mf.WaitForEvictions()
- // Flush write operations on open files so data reaches backing storage.
- // This must come after MemoryFile eviction since eviction may cause file
- // writes.
- if err := k.tasks.flushWritesToFiles(ctx); err != nil {
- return err
- }
+ if VFS2Enabled {
+ // Discard unsavable mappings, such as those for host file descriptors.
+ if err := k.invalidateUnsavableMappings(ctx); err != nil {
+ return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+ }
+
+ // Prepare filesystems for saving. This must be done after
+ // invalidateUnsavableMappings(), since dropping memory mappings may
+ // affect filesystem state (e.g. page cache reference counts).
+ if err := k.vfs.PrepareSave(ctx); err != nil {
+ return err
+ }
+ } else {
+ // Flush cached file writes to backing storage. This must come after
+ // MemoryFile eviction since eviction may cause file writes.
+ if err := k.flushWritesToFiles(ctx); err != nil {
+ return err
+ }
- // Remove all epoll waiter objects from underlying wait queues.
- // NOTE: for programs to resume execution in future snapshot scenarios,
- // we will need to re-establish these waiter objects after saving.
- k.tasks.unregisterEpollWaiters(ctx)
+ // Remove all epoll waiter objects from underlying wait queues.
+ // NOTE: for programs to resume execution in future snapshot scenarios,
+ // we will need to re-establish these waiter objects after saving.
+ k.tasks.unregisterEpollWaiters(ctx)
- // Clear the dirent cache before saving because Dirents must be Loaded in a
- // particular order (parents before children), and Loading dirents from a cache
- // breaks that order.
- if err := k.flushMountSourceRefs(ctx); err != nil {
- return err
- }
+ // Clear the dirent cache before saving because Dirents must be Loaded in a
+ // particular order (parents before children), and Loading dirents from a cache
+ // breaks that order.
+ if err := k.flushMountSourceRefs(ctx); err != nil {
+ return err
+ }
- // Ensure that all inode and mount release operations have completed.
- fs.AsyncBarrier()
+ // Ensure that all inode and mount release operations have completed.
+ fs.AsyncBarrier()
- // Once all fs work has completed (flushed references have all been released),
- // reset mount mappings. This allows individual mounts to save how inodes map
- // to filesystem resources. Without this, fs.Inodes cannot be restored.
- fs.SaveInodeMappings()
+ // Once all fs work has completed (flushed references have all been released),
+ // reset mount mappings. This allows individual mounts to save how inodes map
+ // to filesystem resources. Without this, fs.Inodes cannot be restored.
+ fs.SaveInodeMappings()
- // Discard unsavable mappings, such as those for host file descriptors.
- // This must be done after waiting for "asynchronous fs work", which
- // includes async I/O that may touch application memory.
- if err := k.invalidateUnsavableMappings(ctx); err != nil {
- return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+ // Discard unsavable mappings, such as those for host file descriptors.
+ // This must be done after waiting for "asynchronous fs work", which
+ // includes async I/O that may touch application memory.
+ //
+ // TODO(gvisor.dev/issue/1624): This rationale is believed to be
+ // obsolete since AIO callbacks are now waited-for by Kernel.Pause(),
+ // but this order is conservatively retained for VFS1.
+ if err := k.invalidateUnsavableMappings(ctx); err != nil {
+ return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+ }
}
// Save the CPUID FeatureSet before the rest of the kernel so we can
@@ -486,14 +504,14 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
//
// N.B. This will also be saved along with the full kernel save below.
cpuidStart := time.Now()
- if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil {
+ if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil {
return err
}
log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
// Save the kernel state.
kernelStart := time.Now()
- stats, err := state.Save(k.SupervisorContext(), w, k)
+ stats, err := state.Save(ctx, w, k)
if err != nil {
return err
}
@@ -502,7 +520,7 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
// Save the memory file's state.
memoryStart := time.Now()
- if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil {
+ if err := k.mf.SaveTo(ctx, w); err != nil {
return err
}
log.Infof("Memory save took [%s].", time.Since(memoryStart))
@@ -514,11 +532,9 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
+//
+// Preconditions: !VFS2Enabled.
func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
- if VFS2Enabled {
- return nil // Not relevant.
- }
-
// Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
@@ -561,13 +577,9 @@ func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.Fi
return err
}
-func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if VFS2Enabled {
- return nil
- }
-
- return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
+// Preconditions: !VFS2Enabled.
+func (k *Kernel) flushWritesToFiles(ctx context.Context) error {
+ return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
if flags := file.Flags(); !flags.Write {
return nil
}
@@ -589,37 +601,8 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
})
}
-// Preconditions: The kernel must be paused.
-func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
- invalidated := make(map[*mm.MemoryManager]struct{})
- k.tasks.mu.RLock()
- defer k.tasks.mu.RUnlock()
- for t := range k.tasks.Root.tids {
- // We can skip locking Task.mu here since the kernel is paused.
- if mm := t.tc.MemoryManager; mm != nil {
- if _, ok := invalidated[mm]; !ok {
- if err := mm.InvalidateUnsavable(ctx); err != nil {
- return err
- }
- invalidated[mm] = struct{}{}
- }
- }
- // I really wish we just had a sync.Map of all MMs...
- if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
- if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
- return err
- }
- }
- }
- return nil
-}
-
+// Preconditions: !VFS2Enabled.
func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if VFS2Enabled {
- return
- }
-
ts.mu.RLock()
defer ts.mu.RUnlock()
@@ -644,8 +627,33 @@ func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
}
}
+// Preconditions: The kernel must be paused.
+func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
+ invalidated := make(map[*mm.MemoryManager]struct{})
+ k.tasks.mu.RLock()
+ defer k.tasks.mu.RUnlock()
+ for t := range k.tasks.Root.tids {
+ // We can skip locking Task.mu here since the kernel is paused.
+ if mm := t.image.MemoryManager; mm != nil {
+ if _, ok := invalidated[mm]; !ok {
+ if err := mm.InvalidateUnsavable(ctx); err != nil {
+ return err
+ }
+ invalidated[mm] = struct{}{}
+ }
+ }
+ // I really wish we just had a sync.Map of all MMs...
+ if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
+ if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
// LoadFrom returns a new Kernel loaded from args.
-func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
+func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
loadStart := time.Now()
initAppCores := k.applicationCores
@@ -656,7 +664,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
// don't need to explicitly install it in the Kernel.
cpuidStart := time.Now()
var features cpuid.FeatureSet
- if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil {
+ if _, err := state.Load(ctx, r, &features); err != nil {
return err
}
log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
@@ -671,7 +679,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
// Load the kernel state.
kernelStart := time.Now()
- stats, err := state.Load(k.SupervisorContext(), r, k)
+ stats, err := state.Load(ctx, r, k)
if err != nil {
return err
}
@@ -684,7 +692,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
// Load the memory file's state.
memoryStart := time.Now()
- if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
+ if err := k.mf.LoadFrom(ctx, r); err != nil {
return err
}
log.Infof("Memory load took [%s].", time.Since(memoryStart))
@@ -696,11 +704,17 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
net.Resume()
}
- // Ensure that all pending asynchronous work is complete:
- // - namedpipe opening
- // - inode file opening
- if err := fs.AsyncErrorBarrier(); err != nil {
- return err
+ if VFS2Enabled {
+ if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
+ return err
+ }
+ } else {
+ // Ensure that all pending asynchronous work is complete:
+ // - namedpipe opening
+ // - inode file opening
+ if err := fs.AsyncErrorBarrier(); err != nil {
+ return err
+ }
}
tcpip.AsyncLoading.Wait()
@@ -1005,7 +1019,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
Features: k.featureSet,
}
- tc, se := k.LoadTaskImage(ctx, loadArgs)
+ image, se := k.LoadTaskImage(ctx, loadArgs)
if se != nil {
return nil, 0, errors.New(se.String())
}
@@ -1018,7 +1032,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
config := &TaskConfig{
Kernel: k,
ThreadGroup: tg,
- TaskContext: tc,
+ TaskImage: image,
FSContext: fsContext,
FDTable: args.FDTable,
Credentials: args.Credentials,
@@ -1034,7 +1048,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if err != nil {
return nil, 0, err
}
- t.traceExecEvent(tc) // Simulate exec for tracing.
+ t.traceExecEvent(image) // Simulate exec for tracing.
// Success.
cu.Release()
@@ -1347,6 +1361,13 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
// not have meaningful trace data. Rebuilding here ensures that we can do so
// after tracing has been enabled.
func (k *Kernel) RebuildTraceContexts() {
+ // We need to pause all task goroutines because Task.rebuildTraceContext()
+ // replaces Task.traceContext and Task.traceTask, which are
+ // task-goroutine-exclusive (i.e. the task goroutine assumes that it can
+ // access them without synchronization) for performance.
+ k.Pause()
+ defer k.Unpause()
+
k.extMu.Lock()
defer k.extMu.Unlock()
k.tasks.mu.RLock()
@@ -1462,12 +1483,42 @@ func (k *Kernel) NetlinkPorts() *port.Manager {
return k.netlinkPorts
}
-// SaveError returns the sandbox error that caused the kernel to exit during
-// save.
-func (k *Kernel) SaveError() error {
+var (
+ errSaved = errors.New("sandbox has been successfully saved")
+ errAutoSaved = errors.New("sandbox has been successfully auto-saved")
+)
+
+// SaveStatus returns the sandbox save status. If it was saved successfully,
+// autosaved indicates whether save was triggered by autosave. If it was not
+// saved successfully, err indicates the sandbox error that caused the kernel to
+// exit during save.
+func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
k.extMu.Lock()
defer k.extMu.Unlock()
- return k.saveErr
+ switch k.saveStatus {
+ case nil:
+ return false, false, nil
+ case errSaved:
+ return true, false, nil
+ case errAutoSaved:
+ return true, true, nil
+ default:
+ return false, false, k.saveStatus
+ }
+}
+
+// SetSaveSuccess sets the flag indicating that save completed successfully, if
+// no status was already set.
+func (k *Kernel) SetSaveSuccess(autosave bool) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ if k.saveStatus == nil {
+ if autosave {
+ k.saveStatus = errAutoSaved
+ } else {
+ k.saveStatus = errSaved
+ }
+ }
}
// SetSaveError sets the sandbox error that caused the kernel to exit during
@@ -1475,8 +1526,8 @@ func (k *Kernel) SaveError() error {
func (k *Kernel) SetSaveError(err error) {
k.extMu.Lock()
defer k.extMu.Unlock()
- if k.saveErr == nil {
- k.saveErr = err
+ if k.saveStatus == nil {
+ k.saveStatus = err
}
}
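
saveStatus collapses three outcomes into one error field: nil (never saved), a sentinel (saved, possibly auto-saved), or a genuine failure. A runnable distillation of the decoding in SaveStatus, with locking elided:

    package main

    import (
        "errors"
        "fmt"
    )

    var (
        errSaved     = errors.New("sandbox has been successfully saved")
        errAutoSaved = errors.New("sandbox has been successfully auto-saved")
    )

    type kernel struct {
        saveStatus error // nil, a sentinel above, or a real save error
    }

    func (k *kernel) SaveStatus() (saved, autosaved bool, err error) {
        switch k.saveStatus {
        case nil:
            return false, false, nil
        case errSaved:
            return true, false, nil
        case errAutoSaved:
            return true, true, nil
        default:
            return false, false, k.saveStatus
        }
    }

    func main() {
        k := &kernel{saveStatus: errAutoSaved}
        fmt.Println(k.SaveStatus()) // true true <nil>
    }
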
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
index ce0db5583..d6fb0fdb8 100644
--- a/pkg/sentry/kernel/pipe/node_test.go
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
type sleeper struct {
@@ -66,7 +65,8 @@ func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flag
d := fs.NewDirent(ctx, inode, "pipe")
file, err := n.GetFile(ctx, d, flags)
if err != nil {
- t.Fatalf("open with flags %+v failed: %v", flags, err)
+ t.Errorf("open with flags %+v failed: %v", flags, err)
+ return nil, err
}
if doneChan != nil {
doneChan <- struct{}{}
@@ -85,11 +85,11 @@ func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.
}
func newNamedPipe(t *testing.T) *Pipe {
- return NewPipe(true, DefaultPipeSize, usermem.PageSize)
+ return NewPipe(true, DefaultPipeSize)
}
func newAnonPipe(t *testing.T) *Pipe {
- return NewPipe(false, DefaultPipeSize, usermem.PageSize)
+ return NewPipe(false, DefaultPipeSize)
}
// assertRecvBlocks ensures that a recv attempt on c blocks for at least
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 67beb0ad6..b989e14c7 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -26,18 +26,27 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
const (
// MinimumPipeSize is a hard limit of the minimum size of a pipe.
- MinimumPipeSize = 64 << 10
+ // It corresponds to fs/pipe.c:pipe_min_size.
+ MinimumPipeSize = usermem.PageSize
+
+ // MaximumPipeSize is a hard limit on the maximum size of a pipe.
+ // It corresponds to fs/pipe.c:pipe_max_size.
+ MaximumPipeSize = 1048576
// DefaultPipeSize is the system-wide default size of a pipe in bytes.
- DefaultPipeSize = MinimumPipeSize
+ // It corresponds to pipe_fs_i.h:PIPE_DEF_BUFFERS.
+ DefaultPipeSize = 16 * usermem.PageSize
- // MaximumPipeSize is a hard limit on the maximum size of a pipe.
- MaximumPipeSize = 8 << 20
+ // atomicIOBytes is the maximum number of bytes that the pipe will
+ // guarantee to read or write atomically.
+ // It corresponds to limits.h:PIPE_BUF.
+ atomicIOBytes = 4096
)
// Pipe is an encapsulation of a platform-independent pipe.
@@ -53,12 +62,6 @@ type Pipe struct {
// This value is immutable.
isNamed bool
- // atomicIOBytes is the maximum number of bytes that the pipe will
- // guarantee atomic reads or writes atomically.
- //
- // This value is immutable.
- atomicIOBytes int64
-
// The number of active readers for this pipe.
//
// Access atomically.
@@ -94,47 +97,34 @@ type Pipe struct {
// NewPipe initializes and returns a pipe.
//
-// N.B. The size and atomicIOBytes will be bounded.
-func NewPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe {
+// N.B. The size will be bounded.
+func NewPipe(isNamed bool, sizeBytes int64) *Pipe {
if sizeBytes < MinimumPipeSize {
sizeBytes = MinimumPipeSize
}
if sizeBytes > MaximumPipeSize {
sizeBytes = MaximumPipeSize
}
- if atomicIOBytes <= 0 {
- atomicIOBytes = 1
- }
- if atomicIOBytes > sizeBytes {
- atomicIOBytes = sizeBytes
- }
var p Pipe
- initPipe(&p, isNamed, sizeBytes, atomicIOBytes)
+ initPipe(&p, isNamed, sizeBytes)
return &p
}
-func initPipe(pipe *Pipe, isNamed bool, sizeBytes, atomicIOBytes int64) {
+func initPipe(pipe *Pipe, isNamed bool, sizeBytes int64) {
if sizeBytes < MinimumPipeSize {
sizeBytes = MinimumPipeSize
}
if sizeBytes > MaximumPipeSize {
sizeBytes = MaximumPipeSize
}
- if atomicIOBytes <= 0 {
- atomicIOBytes = 1
- }
- if atomicIOBytes > sizeBytes {
- atomicIOBytes = sizeBytes
- }
pipe.isNamed = isNamed
pipe.max = sizeBytes
- pipe.atomicIOBytes = atomicIOBytes
}
// NewConnectedPipe initializes a pipe and returns a pair of objects
// representing the read and write ends of the pipe.
-func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) {
- p := NewPipe(false /* isNamed */, sizeBytes, atomicIOBytes)
+func NewConnectedPipe(ctx context.Context, sizeBytes int64) (*fs.File, *fs.File) {
+ p := NewPipe(false /* isNamed */, sizeBytes)
// Build an fs.Dirent for the pipe which will be shared by both
// returned files.
@@ -264,7 +254,7 @@ func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
wanted := ops.left()
avail := p.max - p.view.Size()
if wanted > avail {
- if wanted <= p.atomicIOBytes {
+ if wanted <= atomicIOBytes {
return 0, syserror.ErrWouldBlock
}
ops.limit(avail)
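
writeLocked enforces the PIPE_BUF rule: a write of at most atomicIOBytes must land in full or not at all, while larger writes may be truncated to the space available. A self-contained model of just that branch:

    package main

    import (
        "errors"
        "fmt"
    )

    var errWouldBlock = errors.New("would block")

    const atomicIOBytes = 4096 // limits.h:PIPE_BUF

    // write models the short-write rule from writeLocked: small writes are
    // all-or-nothing; large ones may be partially satisfied.
    func write(wanted, avail int64) (int64, error) {
        if wanted > avail {
            if wanted <= atomicIOBytes {
                return 0, errWouldBlock // must stay atomic, so block
            }
            wanted = avail // partial write permitted past PIPE_BUF
        }
        return wanted, nil
    }

    func main() {
        fmt.Println(write(100, 50))  // 0 would block
        fmt.Println(write(8192, 50)) // 50 <nil>
    }
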
diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go
index fe97e9800..3dd739080 100644
--- a/pkg/sentry/kernel/pipe/pipe_test.go
+++ b/pkg/sentry/kernel/pipe/pipe_test.go
@@ -26,7 +26,7 @@ import (
func TestPipeRW(t *testing.T) {
ctx := contexttest.Context(t)
- r, w := NewConnectedPipe(ctx, 65536, 4096)
+ r, w := NewConnectedPipe(ctx, 65536)
defer r.DecRef(ctx)
defer w.DecRef(ctx)
@@ -46,7 +46,7 @@ func TestPipeRW(t *testing.T) {
func TestPipeReadBlock(t *testing.T) {
ctx := contexttest.Context(t)
- r, w := NewConnectedPipe(ctx, 65536, 4096)
+ r, w := NewConnectedPipe(ctx, 65536)
defer r.DecRef(ctx)
defer w.DecRef(ctx)
@@ -61,7 +61,7 @@ func TestPipeWriteBlock(t *testing.T) {
const capacity = MinimumPipeSize
ctx := contexttest.Context(t)
- r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes)
+ r, w := NewConnectedPipe(ctx, capacity)
defer r.DecRef(ctx)
defer w.DecRef(ctx)
@@ -76,7 +76,7 @@ func TestPipeWriteUntilEnd(t *testing.T) {
const atomicIOBytes = 2
ctx := contexttest.Context(t)
- r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes)
+ r, w := NewConnectedPipe(ctx, atomicIOBytes)
defer r.DecRef(ctx)
defer w.DecRef(ctx)
@@ -116,7 +116,8 @@ func TestPipeWriteUntilEnd(t *testing.T) {
}
}
if err != nil {
- t.Fatalf("Readv: got unexpected error %v", err)
+ t.Errorf("Readv: got unexpected error %v", err)
+ return
}
}
}()
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 1a152142b..7b23cbe86 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -33,6 +33,8 @@ import (
// VFSPipe represents the actual pipe, analogous to an inode. VFSPipes should
// not be copied.
+//
+// +stateify savable
type VFSPipe struct {
// mu protects the fields below.
mu sync.Mutex `state:"nosave"`
@@ -52,9 +54,9 @@ type VFSPipe struct {
}
// NewVFSPipe returns an initialized VFSPipe.
-func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
+func NewVFSPipe(isNamed bool, sizeBytes int64) *VFSPipe {
var vp VFSPipe
- initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes)
+ initPipe(&vp.pipe, isNamed, sizeBytes)
return &vp
}
@@ -164,6 +166,8 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, l
// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
// other FileDescriptions for splice(2) and tee(2).
+//
+// +stateify savable
type VFSPipeFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 1145faf13..cef58a590 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -259,8 +259,8 @@ func (t *Task) ptraceTrapLocked(code int32) {
Signo: int32(linux.SIGTRAP),
Code: code,
}
- t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t]))
- t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t]))
+ t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
if t.beginPtraceStopLocked() {
tracer := t.Tracer()
tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP))
@@ -1000,7 +1000,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
// at the address specified by the data parameter, and the return value
// is the error flag." - ptrace(2)
word := t.Arch().Native(0)
- if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
+ if _, err := word.CopyIn(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
return err
}
_, err := word.CopyOut(t, data)
@@ -1008,7 +1008,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
word := t.Arch().Native(uintptr(data))
- _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr)
+ _, err := word.CopyOut(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr)
return err
case linux.PTRACE_GETREGSET:
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index 387edfa91..60917e7d3 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -106,7 +106,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u
func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
data := linux.SeccompData{
Nr: sysno,
- Arch: t.tc.st.AuditNumber,
+ Arch: t.image.st.AuditNumber,
InstructionPointer: uint64(ip),
}
// data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index c00fa1138..3dd3953b3 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -29,17 +29,17 @@ import (
)
const (
- valueMax = 32767 // SEMVMX
+ // Maximum semaphore value.
+ valueMax = linux.SEMVMX
- // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL).
- semaphoresMax = 32000
+ // Maximum number of semaphore sets.
+ setsMax = linux.SEMMNI
- // setMax is "system-wide limit on the number of semaphore sets" (SEMMNI).
- setsMax = 32000
+ // Maximum number of semaphores in a semaphore set.
+ semsMax = linux.SEMMSL
- // semaphoresTotalMax is "system-wide limit on the number of semaphores"
- // (SEMMNS = SEMMNI*SEMMSL).
- semaphoresTotalMax = 1024000000
+ // Maximum number of semaphores in all semaphore sets.
+ semsTotalMax = linux.SEMMNS
)
// Registry maintains a set of semaphores that can be found by key or ID.
@@ -52,6 +52,9 @@ type Registry struct {
mu sync.Mutex `state:"nosave"`
semaphores map[int32]*Set
lastIDUsed int32
+ // indexes maintains a mapping between a set's index in the virtual array and
+ // its identifier.
+ indexes map[int32]int32
}
// Set represents a set of semaphores that can be operated atomically.
@@ -103,6 +106,7 @@ type waiter struct {
waiterEntry
// value represents how much of the resource the waiter needs to wake up.
+ // The value is either 0 or negative.
value int16
ch chan struct{}
}
@@ -112,6 +116,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
return &Registry{
userNS: userNS,
semaphores: make(map[int32]*Set),
+ indexes: make(map[int32]int32),
}
}
@@ -121,7 +126,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
// be found. If exclusive is true, it fails if a set with the same key already
// exists.
func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
- if nsems < 0 || nsems > semaphoresMax {
+ if nsems < 0 || nsems > semsMax {
return nil, syserror.EINVAL
}
@@ -162,10 +167,13 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu
}
// Apply system limits.
+ //
+ // The semaphores and indexes maps in a registry are always the same size,
+ // so checking the semaphores map alone against the system limit suffices.
if len(r.semaphores) >= setsMax {
return nil, syserror.EINVAL
}
- if r.totalSems() > int(semaphoresTotalMax-nsems) {
+ if r.totalSems() > int(semsTotalMax-nsems) {
return nil, syserror.EINVAL
}
@@ -175,6 +183,39 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu
return r.newSet(ctx, key, owner, owner, perms, nsems)
}
+// IPCInfo returns information about system-wide semaphore limits and parameters.
+func (r *Registry) IPCInfo() *linux.SemInfo {
+ return &linux.SemInfo{
+ SemMap: linux.SEMMAP,
+ SemMni: linux.SEMMNI,
+ SemMns: linux.SEMMNS,
+ SemMnu: linux.SEMMNU,
+ SemMsl: linux.SEMMSL,
+ SemOpm: linux.SEMOPM,
+ SemUme: linux.SEMUME,
+ SemUsz: 0, // SemUsz not supported.
+ SemVmx: linux.SEMVMX,
+ SemAem: linux.SEMAEM,
+ }
+}
+
+// HighestIndex returns the index of the highest used entry in
+// the kernel's array.
+func (r *Registry) HighestIndex() int32 {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ // By default, the highest used index is 0 even though
+ // there is no semaphore set.
+ var highestIndex int32
+ for index := range r.indexes {
+ if index > highestIndex {
+ highestIndex = index
+ }
+ }
+ return highestIndex
+}
+
// RemoveID removes the set with the given 'id' from the registry and marks the set as
// dead. All waiters will be awakened and fail.
func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
@@ -185,6 +226,11 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
if set == nil {
return syserror.EINVAL
}
+ index, found := r.findIndexByID(id)
+ if !found {
+ // Inconsistent state.
+ panic(fmt.Sprintf("unable to find an index for ID: %d", id))
+ }
set.mu.Lock()
defer set.mu.Unlock()
@@ -196,6 +242,7 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
}
delete(r.semaphores, set.ID)
+ delete(r.indexes, index)
set.destroy()
return nil
}
@@ -219,6 +266,11 @@ func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.File
continue
}
if r.semaphores[id] == nil {
+ index, found := r.findFirstAvailableIndex()
+ if !found {
+ panic("unable to find an available index")
+ }
+ r.indexes[index] = id
r.lastIDUsed = id
r.semaphores[id] = set
set.ID = id
@@ -246,6 +298,24 @@ func (r *Registry) findByKey(key int32) *Set {
return nil
}
+func (r *Registry) findIndexByID(id int32) (int32, bool) {
+ for k, v := range r.indexes {
+ if v == id {
+ return k, true
+ }
+ }
+ return 0, false
+}
+
+func (r *Registry) findFirstAvailableIndex() (int32, bool) {
+ for index := int32(0); index < setsMax; index++ {
+ if _, present := r.indexes[index]; !present {
+ return index, true
+ }
+ }
+ return 0, false
+}
+
func (r *Registry) totalSems() int {
totalSems := 0
for _, v := range r.semaphores {
@@ -283,6 +353,33 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File
return nil
}
+// GetStat extracts semid_ds information from the set.
+func (s *Set) GetStat(creds *auth.Credentials) (*linux.SemidDS, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // "The calling process must have read permission on the semaphore set."
+ if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+ return nil, syserror.EACCES
+ }
+
+ ds := &linux.SemidDS{
+ SemPerm: linux.IPCPerm{
+ Key: uint32(s.key),
+ UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
+ GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
+ CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
+ CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
+ Mode: uint16(s.perms.LinuxMode()),
+ Seq: 0, // IPC sequence not supported.
+ },
+ SemOTime: s.opTime.TimeT(),
+ SemCTime: s.changeTime.TimeT(),
+ SemNSems: uint64(s.Size()),
+ }
+ return ds, nil
+}
+
// SetVal overrides a semaphore value, waking up waiters as needed.
func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error {
if val < 0 || val > valueMax {
@@ -320,7 +417,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
}
for _, val := range vals {
- if val < 0 || val > valueMax {
+ if val > valueMax {
return syserror.ERANGE
}
}
@@ -396,6 +493,42 @@ func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) {
return sem.pid, nil
}
+func (s *Set) countWaiters(num int32, creds *auth.Credentials, pred func(w *waiter) bool) (uint16, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // The calling process must have read permission on the semaphore set.
+ if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+ return 0, syserror.EACCES
+ }
+
+ sem := s.findSem(num)
+ if sem == nil {
+ return 0, syserror.ERANGE
+ }
+ var cnt uint16
+ for w := sem.waiters.Front(); w != nil; w = w.Next() {
+ if pred(w) {
+ cnt++
+ }
+ }
+ return cnt, nil
+}
+
+// CountZeroWaiters returns the number of waiters waiting for the sem's value to become zero.
+func (s *Set) CountZeroWaiters(num int32, creds *auth.Credentials) (uint16, error) {
+ return s.countWaiters(num, creds, func(w *waiter) bool {
+ return w.value == 0
+ })
+}
+
+// CountNegativeWaiters returns the number of waiters waiting for the sem's value to increase.
+func (s *Set) CountNegativeWaiters(num int32, creds *auth.Credentials) (uint16, error) {
+ return s.countWaiters(num, creds, func(w *waiter) bool {
+ return w.value < 0
+ })
+}
+
// ExecuteOps attempts to execute a list of operations on the set. It only
// succeeds when all operations can be applied. No changes are made if it fails.
//
@@ -548,11 +681,18 @@ func (s *Set) destroy() {
}
}
+func abs(val int16) int16 {
+ if val < 0 {
+ return -val
+ }
+ return val
+}
+
// wakeWaiters goes over all waiters and checks which of them can be notified.
func (s *sem) wakeWaiters() {
// Note that this will release all waiters waiting for 0 too.
for w := s.waiters.Front(); w != nil; {
- if s.value < w.value {
+ if s.value < abs(w.value) {
// Still blocked, skip it.
w = w.Next()
continue
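
The abs() fix matters because waiter.value is either 0 (wait-for-zero) or negative (a blocked decrement): comparing the semaphore value against the signed waiter value would wake blocked decrementers immediately. A runnable check of the corrected condition:

    package main

    import "fmt"

    type waiter struct{ value int16 } // 0 or negative, as in the diff

    func abs(v int16) int16 {
        if v < 0 {
            return -v
        }
        return v
    }

    // canWake mirrors the fixed test in wakeWaiters: a waiter blocked on a
    // decrement of n (stored as -n) wakes once the value reaches n. As the
    // diff's comment notes, wait-for-zero waiters (value 0) are released
    // too and must re-check.
    func canWake(semValue int16, w waiter) bool {
        return semValue >= abs(w.value)
    }

    func main() {
        fmt.Println(canWake(1, waiter{value: -3})) // false: needs 3
        fmt.Println(canWake(3, waiter{value: -3})) // true
        fmt.Println(canWake(0, waiter{value: 0}))  // true
    }
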
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index df5c8421b..0cd9e2533 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -295,7 +295,7 @@ func (tg *ThreadGroup) createSession() error {
id: SessionID(id),
leader: tg,
}
- s.EnableLeakCheck()
+ s.InitRefs()
// Create a new ProcessGroup, belonging to that Session.
// This also has a single reference (assigned below).
@@ -309,7 +309,7 @@ func (tg *ThreadGroup) createSession() error {
session: s,
ancestors: 0,
}
- pg.refs.EnableLeakCheck()
+ pg.refs.InitRefs()
// Tie them and return the result.
s.processGroups.PushBack(pg)
@@ -395,7 +395,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
originator: tg,
session: tg.processGroup.session,
}
- pg.refs.EnableLeakCheck()
+ pg.refs.InitRefs()
if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
pg.ancestors++
@@ -477,20 +477,20 @@ func (tg *ThreadGroup) Session() *Session {
//
// If this group isn't visible in this namespace, zero will be returned. It is
// the caller's responsibility to check that before using this function.
-func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID {
- pidns.owner.mu.RLock()
- defer pidns.owner.mu.RUnlock()
- return pidns.sids[s]
+func (ns *PIDNamespace) IDOfSession(s *Session) SessionID {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.sids[s]
}
// SessionWithID returns the Session with the given ID in the PID namespace ns,
// or nil if that given ID is not defined in this namespace.
//
// A reference is not taken on the session.
-func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session {
- pidns.owner.mu.RLock()
- defer pidns.owner.mu.RUnlock()
- return pidns.sessions[id]
+func (ns *PIDNamespace) SessionWithID(id SessionID) *Session {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.sessions[id]
}
// ProcessGroup returns the ThreadGroup's ProcessGroup.
@@ -505,18 +505,18 @@ func (tg *ThreadGroup) ProcessGroup() *ProcessGroup {
// IDOfProcessGroup returns the process group assigned to pg in PID namespace ns.
//
// The same constraints apply as IDOfSession.
-func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
- pidns.owner.mu.RLock()
- defer pidns.owner.mu.RUnlock()
- return pidns.pgids[pg]
+func (ns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.pgids[pg]
}
// ProcessGroupWithID returns the ProcessGroup with the given ID in the PID
// namespace ns, or nil if that given ID is not defined in this namespace.
//
// A reference is not taken on the process group.
-func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
- pidns.owner.mu.RLock()
- defer pidns.owner.mu.RUnlock()
- return pidns.processGroups[id]
+func (ns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.processGroups[id]
}
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index f8a382fd8..073e14507 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -6,9 +6,12 @@ package(licenses = ["notice"])
go_template_instance(
name = "shm_refs",
out = "shm_refs.go",
+ consts = {
+ "enableLogging": "true",
+ },
package = "shm",
prefix = "Shm",
- template = "//pkg/refs_vfs2:refs_template",
+ template = "//pkg/refsvfs2:refs_template",
types = {
"T": "Shm",
},
@@ -27,7 +30,7 @@ go_library(
"//pkg/context",
"//pkg/log",
"//pkg/refs",
- "//pkg/refs_vfs2",
+ "//pkg/refsvfs2",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index ebbebf46b..92d60ba78 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -251,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
creatorPID: pid,
changeTime: ktime.NowFromContext(ctx),
}
- shm.EnableLeakCheck()
+ shm.InitRefs()
// Find the next available ID.
for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go
index e8cce37d0..2488ae7d5 100644
--- a/pkg/sentry/kernel/signal.go
+++ b/pkg/sentry/kernel/signal.go
@@ -73,7 +73,7 @@ func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo
Signo: int32(sig),
Code: arch.SignalInfoUser,
}
- info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg)))
- info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
+ info.SetPID(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg)))
+ info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
return info
}
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index 78f718cfe..884966120 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -106,8 +106,8 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
Signo: uint32(info.Signo),
Errno: info.Errno,
Code: info.Code,
- PID: uint32(info.Pid()),
- UID: uint32(info.Uid()),
+ PID: uint32(info.PID()),
+ UID: uint32(info.UID()),
Status: info.Status(),
Overrun: uint32(info.Overrun()),
Addr: info.Addr(),
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
index 90f890495..0b17a562e 100644
--- a/pkg/sentry/kernel/syscalls_state.go
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -30,18 +30,18 @@ type syscallTableInfo struct {
}
// saveSt saves the SyscallTable.
-func (tc *TaskContext) saveSt() syscallTableInfo {
+func (image *TaskImage) saveSt() syscallTableInfo {
return syscallTableInfo{
- OS: tc.st.OS,
- Arch: tc.st.Arch,
+ OS: image.st.OS,
+ Arch: image.st.Arch,
}
}
// loadSt loads the SyscallTable.
-func (tc *TaskContext) loadSt(sti syscallTableInfo) {
+func (image *TaskImage) loadSt(sti syscallTableInfo) {
st, ok := LookupSyscallTable(sti.OS, sti.Arch)
if !ok {
panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch))
}
- tc.st = st // Save the table reference.
+ image.st = st // Save the table reference.
}
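The saveSt/loadSt pair illustrates the general pattern behind fields tagged `state:".(...)"`: a pointer into a global registry is serialized as a lookup key and re-resolved on restore. A self-contained sketch of the same pattern, with illustrative names:

package main

import "fmt"

type syscallTable struct{ os, arch string }

// registry stands in for the global syscall table registry.
var registry = map[[2]string]*syscallTable{
	{"linux", "amd64"}: {"linux", "amd64"},
}

// tableInfo is the serialized form: a key, not a pointer.
type tableInfo struct{ OS, Arch string }

type image struct{ st *syscallTable } // st itself is never saved

func (im *image) save() tableInfo {
	return tableInfo{OS: im.st.os, Arch: im.st.arch}
}

func (im *image) load(ti tableInfo) {
	st, ok := registry[[2]string{ti.OS, ti.Arch}]
	if !ok {
		panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", ti.OS, ti.Arch))
	}
	im.st = st // re-resolved on restore
}

func main() {
	im := image{st: registry[[2]string{"linux", "amd64"}]}
	saved := im.save()
	var restored image
	restored.load(saved)
	fmt.Println(restored.st == im.st) // true: same registered table
}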
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index a83ce219c..3fee7aa68 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -75,6 +75,12 @@ func (s *syslog) Log() []byte {
"Checking naughty and nice process list...", // Check it up to twice.
"Granting licence to kill(2)...", // British spelling for British movie.
"Letting the watchdogs out...",
+ "Conjuring /dev/null black hole...",
+ "Adversarially training Redcode AI...",
+ "Singleplexing /dev/ptmx...",
+ "Recruiting cron-ies...",
+ "Verifying that no non-zero bytes made their way into /dev/zero...",
+ "Accelerating teletypewriter to 9600 baud...",
}
selectMessage := func() string {
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 037971393..c0ab53c94 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
- "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/inet"
@@ -29,11 +28,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
- "gvisor.dev/gvisor/pkg/sentry/limits"
- "gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
- "gvisor.dev/gvisor/pkg/sentry/unimpl"
- "gvisor.dev/gvisor/pkg/sentry/uniqueid"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -63,6 +58,12 @@ import (
type Task struct {
taskNode
+ // goid is the task goroutine's ID. goid is owned by the task goroutine,
+ // but since it's used to detect cases where non-task goroutines
+ // incorrectly access state owned by, or exclusive to, the task goroutine,
+ // goid is always accessed using atomic memory operations.
+ goid int64 `state:"nosave"`
+
// runState is what the task goroutine is executing if it is not stopped.
// If runState is nil, the task goroutine should exit or has exited.
// runState is exclusive to the task goroutine.
@@ -83,7 +84,7 @@ type Task struct {
// taskWork is exclusive to the task goroutine.
taskWork []TaskWorker
- // haveSyscallReturn is true if tc.Arch().Return() represents a value
+ // haveSyscallReturn is true if image.Arch().Return() represents a value
// returned by a syscall (or set by ptrace after a syscall).
//
// haveSyscallReturn is exclusive to the task goroutine.
@@ -257,10 +258,10 @@ type Task struct {
// mu protects some of the following fields.
mu sync.Mutex `state:"nosave"`
- // tc holds task data provided by the ELF loader.
+ // image holds task data provided by the ELF loader.
//
- // tc is protected by mu, and is owned by the task goroutine.
- tc TaskContext
+ // image is protected by mu, and is owned by the task goroutine.
+ image TaskImage
// fsContext is the task's filesystem context.
//
@@ -274,7 +275,7 @@ type Task struct {
// If vforkParent is not nil, it is the task that created this task with
// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
- // this TaskContext is released.
+ // this TaskImage is released.
//
// vforkParent is protected by the TaskSet mutex.
vforkParent *Task
@@ -641,64 +642,6 @@ func (t *Task) Kernel() *Kernel {
return t.k
}
-// Value implements context.Context.Value.
-//
-// Preconditions: The caller must be running on the task goroutine (as implied
-// by the requirements of context.Context).
-func (t *Task) Value(key interface{}) interface{} {
- switch key {
- case CtxCanTrace:
- return t.CanTrace
- case CtxKernel:
- return t.k
- case CtxPIDNamespace:
- return t.tg.pidns
- case CtxUTSNamespace:
- return t.utsns
- case CtxIPCNamespace:
- ipcns := t.IPCNamespace()
- ipcns.IncRef()
- return ipcns
- case CtxTask:
- return t
- case auth.CtxCredentials:
- return t.Credentials()
- case context.CtxThreadGroupID:
- return int32(t.ThreadGroup().ID())
- case fs.CtxRoot:
- return t.fsContext.RootDirectory()
- case vfs.CtxRoot:
- return t.fsContext.RootDirectoryVFS2()
- case vfs.CtxMountNamespace:
- t.mountNamespaceVFS2.IncRef()
- return t.mountNamespaceVFS2
- case fs.CtxDirentCacheLimiter:
- return t.k.DirentCacheLimiter
- case inet.CtxStack:
- return t.NetworkContext()
- case ktime.CtxRealtimeClock:
- return t.k.RealtimeClock()
- case limits.CtxLimits:
- return t.tg.limits
- case pgalloc.CtxMemoryFile:
- return t.k.mf
- case pgalloc.CtxMemoryFileProvider:
- return t.k
- case platform.CtxPlatform:
- return t.k
- case uniqueid.CtxGlobalUniqueID:
- return t.k.UniqueID()
- case uniqueid.CtxGlobalUniqueIDProvider:
- return t.k
- case uniqueid.CtxInotifyCookie:
- return t.k.GenerateInotifyCookie()
- case unimpl.CtxEvents:
- return t.k
- default:
- return nil
- }
-}
-
// SetClearTID sets t's cleartid.
//
// Preconditions: The caller must be running on the task goroutine.
@@ -751,12 +694,12 @@ func (t *Task) IsChrooted() bool {
return root != realRoot
}
-// TaskContext returns t's TaskContext.
+// TaskImage returns t's TaskImage.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
-func (t *Task) TaskContext() *TaskContext {
- return &t.tc
+func (t *Task) TaskImage() *TaskImage {
+ return &t.image
}
// FSContext returns t's FSContext. FSContext does not take an additional
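The new goid field is what makes assertions like assertTaskGoroutine (added in task_run.go below) possible: the task goroutine records its own ID at startup, and any other goroutine touching task-goroutine-only state can be caught. Below is a portable standalone sketch of the idea; gVisor's pkg/goid fetches the ID in assembly, so the runtime.Stack parsing here is only a stand-in.

package main

import (
	"bytes"
	"fmt"
	"runtime"
	"strconv"
	"sync/atomic"
)

// currentGoid extracts the goroutine ID from runtime.Stack output
// ("goroutine 42 [running]: ...").
func currentGoid() int64 {
	buf := make([]byte, 64)
	buf = buf[:runtime.Stack(buf, false)]
	buf = bytes.TrimPrefix(buf, []byte("goroutine "))
	id, _ := strconv.ParseInt(string(buf[:bytes.IndexByte(buf, ' ')]), 10, 64)
	return id
}

type task struct{ goid int64 } // owner goroutine ID, accessed atomically

func (t *task) run() { atomic.StoreInt64(&t.goid, currentGoid()) }

// assertTaskGoroutine mirrors the check added in task_run.go: panic if
// called from any goroutine other than the one that called run.
func (t *task) assertTaskGoroutine() {
	if got, want := currentGoid(), atomic.LoadInt64(&t.goid); got != want {
		panic(fmt.Sprintf("running on goroutine %d (task goroutine is %d)", got, want))
	}
}

func main() {
	t := &task{}
	t.run()
	t.assertTaskGoroutine() // ok: same goroutine
	done := make(chan struct{})
	go func() {
		defer close(done)
		defer func() { fmt.Println("recovered:", recover() != nil) }()
		t.assertTaskGoroutine() // panics: wrong goroutine
	}()
	<-done
}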
diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go
index 5f3e60fe8..e574997f7 100644
--- a/pkg/sentry/kernel/task_acct.go
+++ b/pkg/sentry/kernel/task_acct.go
@@ -136,14 +136,14 @@ func (tg *ThreadGroup) IOUsage() *usage.IO {
func (t *Task) Name() string {
t.mu.Lock()
defer t.mu.Unlock()
- return t.tc.Name
+ return t.image.Name
}
// SetName changes t's name.
func (t *Task) SetName(name string) {
t.mu.Lock()
defer t.mu.Unlock()
- t.tc.Name = name
+ t.image.Name = name
t.Debugf("Set thread name to %q", name)
}
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index 4a4a69ee2..9419f2e95 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -20,6 +20,7 @@ import (
"time"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -32,6 +33,8 @@ import (
//
// - An error which is nil if an event is received from C, ETIMEDOUT if the timeout
// expired, and syserror.ErrInterrupted if t is interrupted.
+//
+// Preconditions: The caller must be running on the task goroutine.
func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) {
if !haveTimeout {
return timeout, t.block(C, nil)
@@ -112,7 +115,14 @@ func (t *Task) Block(C <-chan struct{}) error {
// block blocks a task on one of many events.
// N.B. defer is too expensive to be used here.
+//
+// Preconditions: The caller must be running on the task goroutine.
func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
+ // This function is very hot; skip this check outside of +race builds.
+ if sync.RaceEnabled {
+ t.assertTaskGoroutine()
+ }
+
// Fast path if the request is already done.
select {
case <-C:
@@ -156,33 +166,39 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
}
}
-// SleepStart implements amutex.Sleeper.SleepStart.
+// SleepStart implements context.ChannelSleeper.SleepStart.
func (t *Task) SleepStart() <-chan struct{} {
+ t.assertTaskGoroutine()
t.Deactivate()
t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible)
return t.interruptChan
}
-// SleepFinish implements amutex.Sleeper.SleepFinish.
+// SleepFinish implements context.ChannelSleeper.SleepFinish.
func (t *Task) SleepFinish(success bool) {
if !success {
- // The interrupted notification is consumed only at the top-level
- // (Run). Therefore we attempt to reset the pending notification.
- // This will also elide our next entry back into the task, so we
- // will process signals, state changes, etc.
+ // Our caller received from t.interruptChan; we need to re-send to it
+ // to ensure that t.interrupted() is still true.
t.interruptSelf()
}
t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
t.Activate()
}
-// Interrupted implements amutex.Sleeper.Interrupted
+// Interrupted implements context.ChannelSleeper.Interrupted.
func (t *Task) Interrupted() bool {
- return len(t.interruptChan) != 0
+ if t.interrupted() {
+ return true
+ }
+ // Indicate that t's task goroutine is still responsive (i.e. reset the
+ // watchdog timer).
+ t.accountTaskGoroutineRunning()
+ return false
}
// UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
func (t *Task) UninterruptibleSleepStart(deactivate bool) {
+ t.assertTaskGoroutine()
if deactivate {
t.Deactivate()
}
@@ -198,13 +214,17 @@ func (t *Task) UninterruptibleSleepFinish(activate bool) {
}
// interrupted returns true if interrupt or interruptSelf has been called at
-// least once since the last call to interrupted.
+// least once since the last call to unsetInterrupted.
func (t *Task) interrupted() bool {
+ return len(t.interruptChan) != 0
+}
+
+// unsetInterrupted causes interrupted to return false until the next call to
+// interrupt or interruptSelf.
+func (t *Task) unsetInterrupted() {
select {
case <-t.interruptChan:
- return true
default:
- return false
}
}
@@ -220,9 +240,7 @@ func (t *Task) interrupt() {
func (t *Task) interruptSelf() {
select {
case t.interruptChan <- struct{}{}:
- t.Debugf("Interrupt queued")
default:
- t.Debugf("Dropping duplicate interrupt")
}
// platform.Context.Interrupt() is unnecessary since a task goroutine
// calling interruptSelf() cannot also be blocked in
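The interrupted/unsetInterrupted split relies on t.interruptChan being a buffered channel of capacity 1 that acts as a sticky, coalescing flag: peeking via len no longer consumes the interrupt, so SleepFinish's re-send is needed only when a caller has actually received from the channel. A runnable standalone model of the mechanism, with illustrative names:

package main

import "fmt"

// interrupter models the task interrupt mechanism: a one-element
// buffered channel doubles as a resettable flag.
type interrupter struct {
	ch chan struct{}
}

func newInterrupter() *interrupter {
	return &interrupter{ch: make(chan struct{}, 1)}
}

// interrupt sets the flag; duplicate interrupts are coalesced.
func (i *interrupter) interrupt() {
	select {
	case i.ch <- struct{}{}:
	default:
	}
}

// interrupted reports whether the flag is set, without clearing it.
func (i *interrupter) interrupted() bool { return len(i.ch) != 0 }

// unsetInterrupted clears the flag if set.
func (i *interrupter) unsetInterrupted() {
	select {
	case <-i.ch:
	default:
	}
}

func main() {
	i := newInterrupter()
	i.interrupt()
	i.interrupt()                                 // coalesced with the first
	fmt.Println(i.interrupted(), i.interrupted()) // true true: peeking doesn't consume
	i.unsetInterrupted()
	fmt.Println(i.interrupted()) // false
}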
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 682080c14..f305e69c0 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -115,7 +115,7 @@ type CloneOptions struct {
ParentTID usermem.Addr
// If Vfork is true, place the parent in vforkStop until the cloned task
- // releases its TaskContext.
+ // releases its TaskImage.
Vfork bool
// If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for
@@ -226,20 +226,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
})
}
- tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
+ image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace)
if err != nil {
return 0, nil, err
}
cu.Add(func() {
- tc.release()
+ image.release()
})
// clone() returns 0 in the child.
- tc.Arch.SetReturn(0)
+ image.Arch.SetReturn(0)
if opts.Stack != 0 {
- tc.Arch.SetStack(uintptr(opts.Stack))
+ image.Arch.SetStack(uintptr(opts.Stack))
}
if opts.SetTLS {
- if !tc.Arch.SetTLS(uintptr(opts.TLS)) {
+ if !image.Arch.SetTLS(uintptr(opts.TLS)) {
return 0, nil, syserror.EPERM
}
}
@@ -288,7 +288,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
Kernel: t.k,
ThreadGroup: tg,
SignalMask: t.SignalMask(),
- TaskContext: tc,
+ TaskImage: image,
FSContext: fsContext,
FDTable: fdTable,
Credentials: creds,
@@ -355,7 +355,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
if opts.ChildSetTID {
ctid := nt.ThreadID()
- ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
+ ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
}
ntid := t.tg.pidns.IDOfTask(nt)
if opts.ParentSetTID {
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index d1136461a..70b0699dc 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -15,159 +15,175 @@
package kernel
import (
- "fmt"
+ "time"
- "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/kernel/futex"
- "gvisor.dev/gvisor/pkg/sentry/loader"
- "gvisor.dev/gvisor/pkg/sentry/mm"
- "gvisor.dev/gvisor/pkg/syserr"
- "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/unimpl"
+ "gvisor.dev/gvisor/pkg/sentry/uniqueid"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
)
-var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC)
-
-// Auxmap contains miscellaneous data for the task.
-type Auxmap map[string]interface{}
-
-// TaskContext is the subset of a task's data that is provided by the loader.
-//
-// +stateify savable
-type TaskContext struct {
- // Name is the thread name set by the prctl(PR_SET_NAME) system call.
- Name string
-
- // Arch is the architecture-specific context (registers, etc.)
- Arch arch.Context
-
- // MemoryManager is the task's address space.
- MemoryManager *mm.MemoryManager
+// Deadline implements context.Context.Deadline.
+func (t *Task) Deadline() (time.Time, bool) {
+ return time.Time{}, false
+}
- // fu implements futexes in the address space.
- fu *futex.Manager
+// Done implements context.Context.Done.
+func (t *Task) Done() <-chan struct{} {
+ return nil
+}
- // st is the task's syscall table.
- st *SyscallTable `state:".(syscallTableInfo)"`
+// Err implements context.Context.Err.
+func (t *Task) Err() error {
+ return nil
}
-// release releases all resources held by the TaskContext. release is called by
-// the task when it execs into a new TaskContext or exits.
-func (tc *TaskContext) release() {
- // Nil out pointers so that if the task is saved after release, it doesn't
- // follow the pointers to possibly now-invalid objects.
- if tc.MemoryManager != nil {
- tc.MemoryManager.DecUsers(context.Background())
- tc.MemoryManager = nil
+// Value implements context.Context.Value.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Value(key interface{}) interface{} {
+ // This function is very hot; skip this check outside of +race builds.
+ if sync.RaceEnabled {
+ t.assertTaskGoroutine()
}
- tc.fu = nil
+ return t.contextValue(key, true /* isTaskGoroutine */)
}
-// Fork returns a duplicate of tc. The copied TaskContext always has an
-// independent arch.Context. If shareAddressSpace is true, the copied
-// TaskContext shares an address space with the original; otherwise, the copied
-// TaskContext has an independent address space that is initially a duplicate
-// of the original's.
-func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) {
- newTC := &TaskContext{
- Name: tc.Name,
- Arch: tc.Arch.Fork(),
- st: tc.st,
- }
- if shareAddressSpace {
- newTC.MemoryManager = tc.MemoryManager
- if newTC.MemoryManager != nil {
- if !newTC.MemoryManager.IncUsers() {
- // Shouldn't be possible since tc.MemoryManager should be a
- // counted user.
- panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager"))
- }
+func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} {
+ switch key {
+ case CtxCanTrace:
+ return t.CanTrace
+ case CtxKernel:
+ return t.k
+ case CtxPIDNamespace:
+ return t.tg.pidns
+ case CtxUTSNamespace:
+ if !isTaskGoroutine {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ }
+ return t.utsns
+ case CtxIPCNamespace:
+ if !isTaskGoroutine {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ }
+ ipcns := t.ipcns
+ ipcns.IncRef()
+ return ipcns
+ case CtxTask:
+ return t
+ case auth.CtxCredentials:
+ return t.creds.Load()
+ case context.CtxThreadGroupID:
+ return int32(t.tg.ID())
+ case fs.CtxRoot:
+ if !isTaskGoroutine {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ }
+ return t.fsContext.RootDirectory()
+ case vfs.CtxRoot:
+ if !isTaskGoroutine {
+ t.mu.Lock()
+ defer t.mu.Unlock()
}
- newTC.fu = tc.fu
- } else {
- newMM, err := tc.MemoryManager.Fork(ctx)
- if err != nil {
- return nil, err
+ return t.fsContext.RootDirectoryVFS2()
+ case vfs.CtxMountNamespace:
+ if !isTaskGoroutine {
+ t.mu.Lock()
+ defer t.mu.Unlock()
}
- newTC.MemoryManager = newMM
- newTC.fu = k.futexes.Fork()
+ t.mountNamespaceVFS2.IncRef()
+ return t.mountNamespaceVFS2
+ case fs.CtxDirentCacheLimiter:
+ return t.k.DirentCacheLimiter
+ case inet.CtxStack:
+ return t.NetworkContext()
+ case ktime.CtxRealtimeClock:
+ return t.k.RealtimeClock()
+ case limits.CtxLimits:
+ return t.tg.limits
+ case pgalloc.CtxMemoryFile:
+ return t.k.mf
+ case pgalloc.CtxMemoryFileProvider:
+ return t.k
+ case platform.CtxPlatform:
+ return t.k
+ case uniqueid.CtxGlobalUniqueID:
+ return t.k.UniqueID()
+ case uniqueid.CtxGlobalUniqueIDProvider:
+ return t.k
+ case uniqueid.CtxInotifyCookie:
+ return t.k.GenerateInotifyCookie()
+ case unimpl.CtxEvents:
+ return t.k
+ default:
+ return nil
}
- return newTC, nil
}
-// Arch returns t's arch.Context.
-//
-// Preconditions: The caller must be running on the task goroutine, or t.mu
-// must be locked.
-func (t *Task) Arch() arch.Context {
- return t.tc.Arch
+// taskAsyncContext implements context.Context for a goroutine that performs
+// work on behalf of a Task, but is not the task goroutine.
+type taskAsyncContext struct {
+ context.NoopSleeper
+
+ t *Task
}
-// MemoryManager returns t's MemoryManager. MemoryManager does not take an
-// additional reference on the returned MM.
-//
-// Preconditions: The caller must be running on the task goroutine, or t.mu
-// must be locked.
-func (t *Task) MemoryManager() *mm.MemoryManager {
- return t.tc.MemoryManager
+// AsyncContext returns a context.Context representing t. The returned
+// context.Context is intended for use by goroutines other than t's task
+// goroutine; for example, signal delivery to t will not interrupt goroutines
+// that are blocking using the returned context.Context.
+func (t *Task) AsyncContext() context.Context {
+ return taskAsyncContext{t: t}
}
-// SyscallTable returns t's syscall table.
-//
-// Preconditions: The caller must be running on the task goroutine, or t.mu
-// must be locked.
-func (t *Task) SyscallTable() *SyscallTable {
- return t.tc.st
+// Debugf implements log.Logger.Debugf.
+func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
+ ctx.t.Debugf(format, v...)
}
-// Stack returns the userspace stack.
-//
-// Preconditions: The caller must be running on the task goroutine, or t.mu
-// must be locked.
-func (t *Task) Stack() *arch.Stack {
- return &arch.Stack{
- Arch: t.Arch(),
- IO: t.MemoryManager(),
- Bottom: usermem.Addr(t.Arch().Stack()),
- }
+// Infof implements log.Logger.Infof.
+func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
+ ctx.t.Infof(format, v...)
}
-// LoadTaskImage loads a specified file into a new TaskContext.
-//
-// args.MemoryManager does not need to be set by the caller.
-func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) {
- // If File is not nil, we should load that instead of resolving Filename.
- if args.File != nil {
- args.Filename = args.File.PathnameWithDeleted(ctx)
- }
+// Warningf implements log.Logger.Warningf.
+func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
+ ctx.t.Warningf(format, v...)
+}
+
+// IsLogging implements log.Logger.IsLogging.
+func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
+ return ctx.t.IsLogging(level)
+}
- // Prepare a new user address space to load into.
- m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
- defer m.DecUsers(ctx)
- args.MemoryManager = m
+// Deadline implements context.Context.Deadline.
+func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
+ return time.Time{}, false
+}
- os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso)
- if err != nil {
- return nil, err
- }
+// Done implements context.Context.Done.
+func (ctx taskAsyncContext) Done() <-chan struct{} {
+ return nil
+}
- // Lookup our new syscall table.
- st, ok := LookupSyscallTable(os, ac.Arch())
- if !ok {
- // No syscall table found. This means that the ELF binary does not match
- // the architecture.
- return nil, errNoSyscalls
- }
+// Err implements context.Context.Err.
+func (ctx taskAsyncContext) Err() error {
+ return nil
+}
- if !m.IncUsers() {
- panic("Failed to increment users count on new MM")
- }
- return &TaskContext{
- Name: name,
- Arch: ac,
- MemoryManager: m,
- fu: k.futexes.Fork(),
- st: st,
- }, nil
+// Value implements context.Context.Value.
+func (ctx taskAsyncContext) Value(key interface{}) interface{} {
+ return ctx.t.contextValue(key, false /* isTaskGoroutine */)
}
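taskAsyncContext and Task now share contextValue, differing only in whether t.mu must be taken around mutable task state. A compilable standalone model of that split follows; the key, field, and type names are illustrative, not gVisor's.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// task is a context.Context usable only on its own goroutine, while
// AsyncContext wraps it so other goroutines read mutable state under mu.
type task struct {
	mu   sync.Mutex
	name string // mutable state protected by mu
}

type ctxKey int

const ctxName ctxKey = 0

func (t *task) contextValue(key interface{}, isTaskGoroutine bool) interface{} {
	switch key {
	case ctxName:
		if !isTaskGoroutine {
			t.mu.Lock()
			defer t.mu.Unlock()
		}
		return t.name
	default:
		return nil
	}
}

type taskAsyncContext struct{ t *task }

func (c taskAsyncContext) Deadline() (time.Time, bool)       { return time.Time{}, false }
func (c taskAsyncContext) Done() <-chan struct{}             { return nil }
func (c taskAsyncContext) Err() error                        { return nil }
func (c taskAsyncContext) Value(key interface{}) interface{} { return c.t.contextValue(key, false) }

// AsyncContext returns a context safe to use off the task goroutine.
func (t *task) AsyncContext() context.Context { return taskAsyncContext{t: t} }

func main() {
	t := &task{name: "init"}
	ctx := t.AsyncContext()
	fmt.Println(ctx.Value(ctxName)) // "init", read under t.mu
}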
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 412d471d3..d9897e802 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -83,11 +83,12 @@ type execStop struct{}
func (*execStop) Killable() bool { return true }
// Execve implements the execve(2) syscall by killing all other tasks in its
-// thread group and switching to newTC. Execve always takes ownership of newTC.
+// thread group and switching to newImage. Execve always takes ownership of
+// newImage.
//
// Preconditions: The caller must be running Task.doSyscallInvoke on the task
// goroutine.
-func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
+func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) {
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
t.tg.signalHandlers.mu.Lock()
@@ -96,7 +97,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
if t.tg.exiting || t.tg.execing != nil {
// We lost to a racing group-exit, kill, or exec from another thread
// and should just exit.
- newTC.release()
+ newImage.release()
return nil, syserror.EINTR
}
@@ -118,7 +119,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
t.beginInternalStopLocked((*execStop)(nil))
}
- return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil
+ return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil
}
// The runSyscallAfterExecStop state continues execve(2) after all siblings of
@@ -126,16 +127,16 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
//
// +stateify savable
type runSyscallAfterExecStop struct {
- tc *TaskContext
+ image *TaskImage
}
func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
- t.traceExecEvent(r.tc)
+ t.traceExecEvent(r.image)
t.tg.pidns.owner.mu.Lock()
t.tg.execing = nil
if t.killed() {
t.tg.pidns.owner.mu.Unlock()
- r.tc.release()
+ r.image.release()
return (*runInterrupt)(nil)
}
// We are the thread group leader now. Save our old thread ID for
@@ -214,7 +215,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
// executables (set-user/group-ID bits and file capabilities). This
// allows us to unconditionally enable user dumpability on the new mm.
// See fs/exec.c:setup_new_exec.
- r.tc.MemoryManager.SetDumpability(mm.UserDumpable)
+ r.image.MemoryManager.SetDumpability(mm.UserDumpable)
// Switch to the new process.
t.MemoryManager().Deactivate()
@@ -222,8 +223,8 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
// Update credentials to reflect the execve. This should precede switching
// MMs to ensure that dumpability has been reset first, if needed.
t.updateCredsForExecLocked()
- t.tc.release()
- t.tc = *r.tc
+ t.image.release()
+ t.image = *r.image
t.mu.Unlock()
t.unstopVforkParent()
t.p.FullStateChanged()
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index ce7b9641d..16986244c 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -266,7 +266,7 @@ func (*runExitMain) execute(t *Task) taskRunState {
t.updateRSSLocked()
t.tg.pidns.owner.mu.Unlock()
t.mu.Lock()
- t.tc.release()
+ t.image.release()
t.mu.Unlock()
// Releasing the MM unblocks a blocked CLONE_VFORK parent.
@@ -368,8 +368,8 @@ func (t *Task) exitChildren() {
Signo: int32(sig),
Code: arch.SignalInfoUser,
}
- siginfo.SetPid(int32(c.tg.pidns.tids[t]))
- siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
+ siginfo.SetPID(int32(c.tg.pidns.tids[t]))
+ siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
c.tg.signalHandlers.mu.Lock()
c.sendSignalLocked(siginfo, true /* group */)
c.tg.signalHandlers.mu.Unlock()
@@ -698,8 +698,8 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.Si
info := &arch.SignalInfo{
Signo: int32(sig),
}
- info.SetPid(int32(receiver.tg.pidns.tids[t]))
- info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
+ info.SetPID(int32(receiver.tg.pidns.tids[t]))
+ info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
if t.exitStatus.Signaled() {
info.Code = arch.CLD_KILLED
info.SetStatus(int32(t.exitStatus.Signo))
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index c80391475..195c7da9b 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -26,7 +26,7 @@ import (
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) Futex() *futex.Manager {
- return t.tc.fu
+ return t.image.fu
}
// SwapUint32 implements futex.Target.SwapUint32.
diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go
new file mode 100644
index 000000000..ce5fbd299
--- /dev/null
+++ b/pkg/sentry/kernel/task_image.go
@@ -0,0 +1,173 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.dev/gvisor/pkg/sentry/loader"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC)
+
+// Auxmap contains miscellaneous data for the task.
+type Auxmap map[string]interface{}
+
+// TaskImage is the subset of a task's data that is provided by the loader.
+//
+// +stateify savable
+type TaskImage struct {
+ // Name is the thread name set by the prctl(PR_SET_NAME) system call.
+ Name string
+
+ // Arch is the architecture-specific context (registers, etc.)
+ Arch arch.Context
+
+ // MemoryManager is the task's address space.
+ MemoryManager *mm.MemoryManager
+
+ // fu implements futexes in the address space.
+ fu *futex.Manager
+
+ // st is the task's syscall table.
+ st *SyscallTable `state:".(syscallTableInfo)"`
+}
+
+// release releases all resources held by the TaskImage. release is called by
+// the task when it execs into a new TaskImage or exits.
+func (image *TaskImage) release() {
+ // Nil out pointers so that if the task is saved after release, it doesn't
+ // follow the pointers to possibly now-invalid objects.
+ if image.MemoryManager != nil {
+ image.MemoryManager.DecUsers(context.Background())
+ image.MemoryManager = nil
+ }
+ image.fu = nil
+}
+
+// Fork returns a duplicate of image. The copied TaskImage always has an
+// independent arch.Context. If shareAddressSpace is true, the copied
+// TaskImage shares an address space with the original; otherwise, the copied
+// TaskImage has an independent address space that is initially a duplicate
+// of the original's.
+func (image *TaskImage) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskImage, error) {
+ newImage := &TaskImage{
+ Name: image.Name,
+ Arch: image.Arch.Fork(),
+ st: image.st,
+ }
+ if shareAddressSpace {
+ newImage.MemoryManager = image.MemoryManager
+ if newImage.MemoryManager != nil {
+ if !newImage.MemoryManager.IncUsers() {
+ // Shouldn't be possible since image.MemoryManager should be a
+ // counted user.
+ panic(fmt.Sprintf("TaskImage.Fork called with userless TaskImage.MemoryManager"))
+ }
+ }
+ newImage.fu = image.fu
+ } else {
+ newMM, err := image.MemoryManager.Fork(ctx)
+ if err != nil {
+ return nil, err
+ }
+ newImage.MemoryManager = newMM
+ newImage.fu = k.futexes.Fork()
+ }
+ return newImage, nil
+}
+
+// Arch returns t's arch.Context.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Arch() arch.Context {
+ return t.image.Arch
+}
+
+// MemoryManager returns t's MemoryManager. MemoryManager does not take an
+// additional reference on the returned MM.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) MemoryManager() *mm.MemoryManager {
+ return t.image.MemoryManager
+}
+
+// SyscallTable returns t's syscall table.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) SyscallTable() *SyscallTable {
+ return t.image.st
+}
+
+// Stack returns the userspace stack.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Stack() *arch.Stack {
+ return &arch.Stack{
+ Arch: t.Arch(),
+ IO: t.MemoryManager(),
+ Bottom: usermem.Addr(t.Arch().Stack()),
+ }
+}
+
+// LoadTaskImage loads a specified file into a new TaskImage.
+//
+// args.MemoryManager does not need to be set by the caller.
+func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskImage, *syserr.Error) {
+ // If File is not nil, we should load that instead of resolving Filename.
+ if args.File != nil {
+ args.Filename = args.File.PathnameWithDeleted(ctx)
+ }
+
+ // Prepare a new user address space to load into.
+ m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
+ defer m.DecUsers(ctx)
+ args.MemoryManager = m
+
+ os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso)
+ if err != nil {
+ return nil, err
+ }
+
+ // Look up our new syscall table.
+ st, ok := LookupSyscallTable(os, ac.Arch())
+ if !ok {
+ // No syscall table found. This means that the ELF binary does not match
+ // the architecture.
+ return nil, errNoSyscalls
+ }
+
+ if !m.IncUsers() {
+ panic("Failed to increment users count on new MM")
+ }
+ return &TaskImage{
+ Name: name,
+ Arch: ac,
+ MemoryManager: m,
+ fu: k.futexes.Fork(),
+ st: st,
+ }, nil
+}
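TaskImage.Fork's shared-address-space branch leans on mm.MemoryManager's user counting: IncUsers fails once the count has reached zero, so an exiting task can never hand a dead address space to a new sharer. A standalone sketch of that counting discipline, with illustrative names:

package main

import (
	"fmt"
	"sync/atomic"
)

// users models the IncUsers/DecUsers discipline: joining is only
// possible while the object is still live.
type users struct{ n int64 }

func (u *users) IncUsers() bool {
	for {
		n := atomic.LoadInt64(&u.n)
		if n == 0 {
			return false // already dead; cannot revive
		}
		if atomic.CompareAndSwapInt64(&u.n, n, n+1) {
			return true
		}
	}
}

func (u *users) DecUsers(destroy func()) {
	if atomic.AddInt64(&u.n, -1) == 0 && destroy != nil {
		destroy()
	}
}

func main() {
	mm := &users{n: 1} // creator holds the initial user
	if !mm.IncUsers() {
		panic("Fork called with userless MemoryManager") // the TaskImage.Fork invariant
	}
	mm.DecUsers(nil)
	mm.DecUsers(func() { fmt.Println("address space released") })
	fmt.Println(mm.IncUsers()) // false: dead objects stay dead
}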
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index d23cea802..c70e5e6ce 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -19,6 +19,7 @@ import (
"runtime/trace"
"sort"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -215,7 +216,7 @@ func (t *Task) rebuildTraceContext(tid ThreadID) {
// arbitrarily large (in general it won't be, especially for cases
// where we're collecting a brief profile), so using the TID is a
// reasonable compromise in this case.
- t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid))
+ t.traceContext, t.traceTask = trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", tid))
}
// traceCloneEvent is called when a new task is spawned.
@@ -237,11 +238,11 @@ func (t *Task) traceExitEvent() {
}
// traceExecEvent is called when a task calls exec.
-func (t *Task) traceExecEvent(tc *TaskContext) {
+func (t *Task) traceExecEvent(image *TaskImage) {
if !trace.IsEnabled() {
return
}
- file := tc.MemoryManager.Executable()
+ file := image.MemoryManager.Executable()
if file == nil {
trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
return
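rebuildTraceContext now derives the trace task from context.Background() rather than from t itself. The sketch below exercises the stdlib runtime/trace calls the code relies on; the tid value is made up.

package main

import (
	"context"
	"fmt"
	"os"
	"runtime/trace"
)

func main() {
	// Emit a trace file so the task shows up in `go tool trace`.
	f, err := os.Create("trace.out")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := trace.Start(f); err != nil {
		panic(err)
	}
	defer trace.Stop()

	// As in rebuildTraceContext: derive the trace task from
	// context.Background() and name it after a stable identifier.
	tid := 42
	ctx, task := trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", tid))
	defer task.End()

	trace.Logf(ctx, "exec", "exec: /bin/true")
}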
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 8dc3fec90..3ccecf4b6 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -16,11 +16,13 @@ package kernel
import (
"bytes"
+ "fmt"
"runtime"
"runtime/trace"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/goid"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -57,6 +59,8 @@ type taskRunState interface {
// make it visible in stack dumps. A goroutine for a given task can be identified
// searching for Task.run()'s argument value.
func (t *Task) run(threadID uintptr) {
+ atomic.StoreInt64(&t.goid, goid.Get())
+
// Construct t.blockingTimer here. We do this here because we can't
// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
// kernel.timekeeper.SetClocks() hasn't been called yet.
@@ -99,6 +103,9 @@ func (t *Task) run(threadID uintptr) {
t.tg.pidns.owner.runningGoroutines.Done()
t.p.Release()
+ // Deferring this store triggers a false positive in the race
+ // detector (https://github.com/golang/go/issues/42599).
+ atomic.StoreInt64(&t.goid, 0)
// Keep argument alive because stack trace for dead variables may not be correct.
runtime.KeepAlive(threadID)
return
@@ -317,7 +324,7 @@ func (app *runApp) execute(t *Task) taskRunState {
// region. We should be able to easily identify
// vsyscalls by having a <fault><syscall> pair.
if at.Execute {
- if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
+ if sysno, ok := t.image.st.LookupEmulate(addr); ok {
return t.doVsyscall(addr, sysno)
}
}
@@ -375,6 +382,19 @@ func (app *runApp) execute(t *Task) taskRunState {
}
}
+// assertTaskGoroutine panics if the caller is not running on t's task
+// goroutine.
+func (t *Task) assertTaskGoroutine() {
+ if got, want := goid.Get(), atomic.LoadInt64(&t.goid); got != want {
+ panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want))
+ }
+}
+
+// GoroutineID returns the ID of t's task goroutine.
+func (t *Task) GoroutineID() int64 {
+ return atomic.LoadInt64(&t.goid)
+}
+
// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
func (t *Task) waitGoroutineStoppedOrExited() {
t.goroutineStopped.Wait()
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 52c55d13d..9ba5f8d78 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -157,6 +157,18 @@ func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
t.goschedSeq.EndWrite()
}
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) accountTaskGoroutineRunning() {
+ now := t.k.CPUClockNow()
+ if t.gosched.State != TaskGoroutineRunningSys {
+ panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys))
+ }
+ t.goschedSeq.BeginWrite()
+ t.gosched.SysTicks += now - t.gosched.Timestamp
+ t.gosched.Timestamp = now
+ t.goschedSeq.EndWrite()
+}
+
// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
// Most clients should use t.CPUStats() instead.
func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
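accountTaskGoroutineRunning charges elapsed ticks inside a goschedSeq write section so readers of TaskGoroutineSchedInfo never observe a torn update. Below is a simplified seqlock model of that write/read protocol; a production version needs the careful memory ordering that gVisor's sync.SeqCount provides, so treat this purely as a sketch.

package main

import (
	"fmt"
	"sync/atomic"
)

type schedInfo struct {
	seq      uint32
	sysTicks int64
	stamp    int64
}

func (s *schedInfo) beginWrite() { atomic.AddUint32(&s.seq, 1) } // seq becomes odd
func (s *schedInfo) endWrite()   { atomic.AddUint32(&s.seq, 1) } // seq even again

// accountRunning charges elapsed ticks to system time, as the new
// task_sched.go helper does when resetting the watchdog.
func (s *schedInfo) accountRunning(now int64) {
	s.beginWrite()
	s.sysTicks += now - s.stamp
	s.stamp = now
	s.endWrite()
}

// readSysTicks retries until it observes a consistent snapshot.
func (s *schedInfo) readSysTicks() int64 {
	for {
		seq := atomic.LoadUint32(&s.seq)
		if seq%2 != 0 {
			continue // write in progress
		}
		v := s.sysTicks
		if atomic.LoadUint32(&s.seq) == seq {
			return v
		}
	}
}

func main() {
	s := &schedInfo{stamp: 100}
	s.accountRunning(175)
	fmt.Println(s.readSysTicks()) // 75
}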
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index ebdb83061..75af3af79 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -619,9 +619,6 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
return
}
})
- // We have to re-issue the interrupt consumed by t.interrupted() since
- // it might have been for a different reason.
- t.interruptSelf()
}
// Conversely, if the new mask unblocks any signals that were blocked by
@@ -917,8 +914,8 @@ func (t *Task) signalStop(target *Task, code int32, status int32) {
Signo: int32(linux.SIGCHLD),
Code: code,
}
- sigchld.SetPid(int32(t.tg.pidns.tids[target]))
- sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ sigchld.SetPID(int32(t.tg.pidns.tids[target]))
+ sigchld.SetUID(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
sigchld.SetStatus(status)
// TODO(b/72102453): Set utime, stime.
t.sendSignalLocked(sigchld, true /* group */)
@@ -931,10 +928,10 @@ func (t *Task) signalStop(target *Task, code int32, status int32) {
type runInterrupt struct{}
func (*runInterrupt) execute(t *Task) taskRunState {
- // Interrupts are de-duplicated (if t is interrupted twice before
- // t.interrupted() is called, t.interrupted() will only return true once),
- // so early exits from this function must re-enter the runInterrupt state
- // to check for more interrupt-signaled conditions.
+ // Interrupts are de-duplicated (t.unsetInterrupted() will undo the effect
+ // of all previous calls to t.interrupted() regardless of how many such
+ // calls there have been), so early exits from this function must re-enter
+ // the runInterrupt state to check for more interrupt-signaled conditions.
t.tg.signalHandlers.mu.Lock()
@@ -1025,8 +1022,8 @@ func (*runInterrupt) execute(t *Task) taskRunState {
Signo: int32(sig),
Code: t.ptraceCode,
}
- t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t]))
- t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t]))
+ t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
} else {
t.ptraceCode = int32(sig)
t.ptraceSiginfo = nil
@@ -1080,6 +1077,7 @@ func (*runInterrupt) execute(t *Task) taskRunState {
return t.deliverSignal(info, act)
}
+ t.unsetInterrupted()
t.tg.signalHandlers.mu.Unlock()
return (*runApp)(nil)
}
@@ -1116,11 +1114,11 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState {
if parent == nil {
// Tracer has detached and t was created by Kernel.CreateProcess().
// Pretend the parent is in an ancestor PID + user namespace.
- info.SetPid(0)
- info.SetUid(int32(auth.OverflowUID))
+ info.SetPID(0)
+ info.SetUID(int32(auth.OverflowUID))
} else {
- info.SetPid(int32(t.tg.pidns.tids[parent]))
- info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ info.SetPID(int32(t.tg.pidns.tids[parent]))
+ info.SetUID(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
}
}
t.tg.signalHandlers.mu.Lock()
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 8e28230cc..36e1384f1 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -46,10 +46,10 @@ type TaskConfig struct {
// SignalMask is the new task's initial signal mask.
SignalMask linux.SignalSet
- // TaskContext is the TaskContext of the new task. Ownership of the
- // TaskContext is transferred to TaskSet.NewTask, whether or not it
+ // TaskImage is the TaskImage of the new task. Ownership of the
+ // TaskImage is transferred to TaskSet.NewTask, whether or not it
// succeeds.
- TaskContext *TaskContext
+ TaskImage *TaskImage
// FSContext is the FSContext of the new task. A reference must be held on
// FSContext, which is transferred to TaskSet.NewTask whether or not it
@@ -105,7 +105,7 @@ type TaskConfig struct {
func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
t, err := ts.newTask(cfg)
if err != nil {
- cfg.TaskContext.release()
+ cfg.TaskImage.release()
cfg.FSContext.DecRef(ctx)
cfg.FDTable.DecRef(ctx)
cfg.IPCNamespace.DecRef(ctx)
@@ -121,7 +121,7 @@ func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error)
// of cfg if it succeeds.
func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
tg := cfg.ThreadGroup
- tc := cfg.TaskContext
+ image := cfg.TaskImage
t := &Task{
taskNode: taskNode{
tg: tg,
@@ -132,7 +132,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
interruptChan: make(chan struct{}, 1),
signalMask: cfg.SignalMask,
signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable},
- tc: *tc,
+ image: *image,
fsContext: cfg.FSContext,
fdTable: cfg.FDTable,
p: cfg.Kernel.Platform.NewContext(),
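The TaskConfig ownership rule (resources are consumed whether or not NewTask succeeds) is why the error path above releases the image, FSContext, FDTable, and IPCNamespace itself. A toy sketch of that transfer-on-call convention, with illustrative names:

package main

import "fmt"

type resource struct{ name string }

func (r *resource) release() { fmt.Println("released", r.name) }

type config struct {
	image, fsContext *resource
}

// newTask consumes the caller's references whether or not it succeeds,
// so the failure path must release everything itself.
func newTask(cfg *config, fail bool) (*struct{}, error) {
	if fail {
		cfg.image.release()
		cfg.fsContext.release()
		return nil, fmt.Errorf("newTask failed")
	}
	return &struct{}{}, nil
}

func main() {
	cfg := &config{image: &resource{"TaskImage"}, fsContext: &resource{"FSContext"}}
	if _, err := newTask(cfg, true); err != nil {
		fmt.Println(err) // resources were already released by newTask
	}
}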
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index ce134bf54..94dabbcd8 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,7 +18,8 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -281,29 +282,89 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
}, nil
}
-// copyContext implements marshal.CopyContext. It wraps a task to allow copying
-// memory to and from the task memory with custom usermem.IOOpts.
-type copyContext struct {
- *Task
+type taskCopyContext struct {
+ ctx context.Context
+ t *Task
opts usermem.IOOpts
}
-// AsCopyContext wraps the task and returns it as CopyContext.
-func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
- return &copyContext{t, opts}
+// CopyContext returns a marshal.CopyContext that copies to/from t's address
+// space using opts.
+func (t *Task) CopyContext(ctx context.Context, opts usermem.IOOpts) *taskCopyContext {
+ return &taskCopyContext{
+ ctx: ctx,
+ t: t,
+ opts: opts,
+ }
+}
+
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (cc *taskCopyContext) CopyScratchBuffer(size int) []byte {
+ if ctxTask, ok := cc.ctx.(*Task); ok {
+ return ctxTask.CopyScratchBuffer(size)
+ }
+ return make([]byte, size)
+}
+
+func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) {
+ cc.t.mu.Lock()
+ tmm := cc.t.MemoryManager()
+ cc.t.mu.Unlock()
+ if !tmm.IncUsers() {
+ return nil, syserror.EFAULT
+ }
+ return tmm, nil
+}
+
+// CopyInBytes implements marshal.CopyContext.CopyInBytes.
+func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+ tmm, err := cc.getMemoryManager()
+ if err != nil {
+ return 0, err
+ }
+ defer tmm.DecUsers(cc.ctx)
+ return tmm.CopyIn(cc.ctx, addr, dst, cc.opts)
+}
+
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
+func (cc *taskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+ tmm, err := cc.getMemoryManager()
+ if err != nil {
+ return 0, err
+ }
+ defer tmm.DecUsers(cc.ctx)
+ return tmm.CopyOut(cc.ctx, addr, src, cc.opts)
+}
+
+type ownTaskCopyContext struct {
+ t *Task
+ opts usermem.IOOpts
+}
+
+// OwnCopyContext returns a marshal.CopyContext that copies to/from t's address
+// space using opts. The returned CopyContext may only be used by t's task
+// goroutine.
+//
+// Since t already implements marshal.CopyContext, this is only needed to
+// override the usermem.IOOpts used for the copy.
+func (t *Task) OwnCopyContext(opts usermem.IOOpts) *ownTaskCopyContext {
+ return &ownTaskCopyContext{
+ t: t,
+ opts: opts,
+ }
}
-// CopyInString copies a string in from the task's memory.
-func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
- return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (cc *ownTaskCopyContext) CopyScratchBuffer(size int) []byte {
+ return cc.t.CopyScratchBuffer(size)
}
-// CopyInBytes copies task memory into dst from an IO context.
-func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
- return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+// CopyInBytes implements marshal.CopyContext.CopyInBytes.
+func (cc *ownTaskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+ return cc.t.MemoryManager().CopyIn(cc.t, addr, dst, cc.opts)
}
-// CopyOutBytes copies src into task memoryfrom an IO context.
-func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
- return t.MemoryManager().CopyOut(t, addr, src, t.opts)
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
+func (cc *ownTaskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+ return cc.t.MemoryManager().CopyOut(cc.t, addr, src, cc.opts)
}
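The difference between the two new contexts: taskCopyContext pins the MemoryManager per operation (getMemoryManager's IncUsers/DecUsers) so it works off the task goroutine, while ownTaskCopyContext assumes the task goroutine and merely overrides IOOpts. A self-contained model of the CopyContext shape, using a byte slice for memory and illustrative names throughout:

package main

import (
	"encoding/binary"
	"fmt"
)

// copyContext models the marshal-style interface both types satisfy.
type copyContext interface {
	CopyScratchBuffer(size int) []byte
	CopyOutBytes(addr int, src []byte) (int, error)
}

type mem struct{ data []byte }

// offTaskCtx is like taskCopyContext: safe off the owning goroutine
// (the real code pins the MemoryManager around each copy).
type offTaskCtx struct{ m *mem }

func (c *offTaskCtx) CopyScratchBuffer(size int) []byte { return make([]byte, size) }
func (c *offTaskCtx) CopyOutBytes(addr int, src []byte) (int, error) {
	return copy(c.m.data[addr:], src), nil
}

// onTaskCtx is like ownTaskCopyContext: it reuses the owner's scratch
// buffer, which is only sound on the owning goroutine.
type onTaskCtx struct {
	m       *mem
	scratch []byte
}

func (c *onTaskCtx) CopyScratchBuffer(size int) []byte {
	if cap(c.scratch) < size {
		c.scratch = make([]byte, size)
	}
	return c.scratch[:size]
}
func (c *onTaskCtx) CopyOutBytes(addr int, src []byte) (int, error) {
	return copy(c.m.data[addr:], src), nil
}

// writeTID mirrors the ChildSetTID write in task_clone.go: marshal a TID
// through whichever CopyContext the caller supplies.
func writeTID(cc copyContext, addr int, tid int32) error {
	buf := cc.CopyScratchBuffer(4)
	binary.LittleEndian.PutUint32(buf, uint32(tid))
	_, err := cc.CopyOutBytes(addr, buf)
	return err
}

func main() {
	m := &mem{data: make([]byte, 16)}
	if err := writeTID(&offTaskCtx{m: m}, 8, 123); err != nil {
		panic(err)
	}
	fmt.Println(binary.LittleEndian.Uint32(m.data[8:])) // 123
}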
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index 9bc452e67..9e5c2d26f 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -115,7 +115,7 @@ func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error {
}
if old != v.seq {
- return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq)
+ return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d; application may hang or get incorrect time from the VDSO", old, v.seq)
}
v.seq = next