summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--pkg/sentry/kernel/BUILD55
-rw-r--r--pkg/sentry/kernel/auth/BUILD1
-rw-r--r--pkg/sentry/kernel/auth/context.go20
-rw-r--r--pkg/sentry/kernel/auth/id.go4
-rw-r--r--pkg/sentry/kernel/fd_table.go183
-rw-r--r--pkg/sentry/kernel/fd_table_test.go8
-rw-r--r--pkg/sentry/kernel/fd_table_unsafe.go63
-rw-r--r--pkg/sentry/kernel/fs_context.go89
-rw-r--r--pkg/sentry/kernel/kcov.go321
-rw-r--r--pkg/sentry/kernel/kcov_unsafe.go28
-rw-r--r--pkg/sentry/kernel/kernel.go19
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go14
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go68
-rw-r--r--pkg/sentry/kernel/ptrace.go48
-rw-r--r--pkg/sentry/kernel/ptrace_amd64.go2
-rw-r--r--pkg/sentry/kernel/rseq.go31
-rw-r--r--pkg/sentry/kernel/sessions.go29
-rw-r--r--pkg/sentry/kernel/shm/BUILD13
-rw-r--r--pkg/sentry/kernel/shm/shm.go19
-rw-r--r--pkg/sentry/kernel/syscalls.go10
-rw-r--r--pkg/sentry/kernel/task.go18
-rw-r--r--pkg/sentry/kernel/task_clone.go6
-rw-r--r--pkg/sentry/kernel/task_exec.go7
-rw-r--r--pkg/sentry/kernel/task_exit.go5
-rw-r--r--pkg/sentry/kernel/task_futex.go7
-rw-r--r--pkg/sentry/kernel/task_run.go7
-rw-r--r--pkg/sentry/kernel/task_sched.go11
-rw-r--r--pkg/sentry/kernel/task_signals.go20
-rw-r--r--pkg/sentry/kernel/task_stop.go14
-rw-r--r--pkg/sentry/kernel/task_syscall.go76
-rw-r--r--pkg/sentry/kernel/task_usermem.go64
-rw-r--r--pkg/sentry/kernel/threads.go2
-rw-r--r--pkg/sentry/kernel/time/time.go6
-rw-r--r--pkg/sentry/kernel/vdso.go3
34 files changed, 892 insertions, 379 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 5416a310d..a43c549f1 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -74,6 +74,50 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "fd_table_refs",
+ out = "fd_table_refs.go",
+ package = "kernel",
+ prefix = "FDTable",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "FDTable",
+ },
+)
+
+go_template_instance(
+ name = "fs_context_refs",
+ out = "fs_context_refs.go",
+ package = "kernel",
+ prefix = "FSContext",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "FSContext",
+ },
+)
+
+go_template_instance(
+ name = "process_group_refs",
+ out = "process_group_refs.go",
+ package = "kernel",
+ prefix = "ProcessGroup",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "ProcessGroup",
+ },
+)
+
+go_template_instance(
+ name = "session_refs",
+ out = "session_refs.go",
+ package = "kernel",
+ prefix = "Session",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "Session",
+ },
+)
+
proto_library(
name = "uncaught_signal",
srcs = ["uncaught_signal.proto"],
@@ -88,9 +132,13 @@ go_library(
"aio.go",
"context.go",
"fd_table.go",
+ "fd_table_refs.go",
"fd_table_unsafe.go",
"fs_context.go",
+ "fs_context_refs.go",
"ipc_namespace.go",
+ "kcov.go",
+ "kcov_unsafe.go",
"kernel.go",
"kernel_opts.go",
"kernel_state.go",
@@ -99,6 +147,7 @@ go_library(
"pending_signals_state.go",
"posixtimer.go",
"process_group_list.go",
+ "process_group_refs.go",
"ptrace.go",
"ptrace_amd64.go",
"ptrace_arm64.go",
@@ -106,6 +155,7 @@ go_library(
"seccomp.go",
"seqatomic_taskgoroutineschedinfo_unsafe.go",
"session_list.go",
+ "session_refs.go",
"sessions.go",
"signal.go",
"signal_handlers.go",
@@ -147,6 +197,7 @@ go_library(
"gvisor.dev/gvisor/pkg/sentry/device",
"gvisor.dev/gvisor/pkg/tcpip",
],
+ marshal = True,
visibility = ["//:sandbox"],
deps = [
":uncaught_signal_go_proto",
@@ -157,10 +208,13 @@ go_library(
"//pkg/bits",
"//pkg/bpf",
"//pkg/context",
+ "//pkg/coverage",
"//pkg/cpuid",
"//pkg/eventchannel",
"//pkg/fspath",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/refs",
"//pkg/refs_vfs2",
@@ -210,7 +264,6 @@ go_library(
"//pkg/tcpip/stack",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 2bc49483a..869e49ebc 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -57,6 +57,7 @@ go_library(
"id_map_set.go",
"user_namespace.go",
],
+ marshal = True,
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go
index ef5723127..c08d47787 100644
--- a/pkg/sentry/kernel/auth/context.go
+++ b/pkg/sentry/kernel/auth/context.go
@@ -34,3 +34,23 @@ func CredentialsFromContext(ctx context.Context) *Credentials {
}
return NewAnonymousCredentials()
}
+
+// ContextWithCredentials returns a copy of ctx carrying creds.
+func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context {
+ return &authContext{ctx, creds}
+}
+
+type authContext struct {
+ context.Context
+ creds *Credentials
+}
+
+// Value implements context.Context.
+func (ac *authContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxCredentials:
+ return ac.creds
+ default:
+ return ac.Context.Value(key)
+ }
+}
diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go
index 0a58ba17c..4c32ee703 100644
--- a/pkg/sentry/kernel/auth/id.go
+++ b/pkg/sentry/kernel/auth/id.go
@@ -19,9 +19,13 @@ import (
)
// UID is a user ID in an unspecified user namespace.
+//
+// +marshal
type UID uint32
// GID is a group ID in an unspecified user namespace.
+//
+// +marshal slice:GIDSlice
type GID uint32
// In the root user namespace, user/group IDs have a 1-to-1 relationship with
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index ce53af69b..0ec7344cd 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -78,7 +77,8 @@ type descriptor struct {
//
// +stateify savable
type FDTable struct {
- refs.AtomicRefCount
+ FDTableRefs
+
k *Kernel
// mu protects below.
@@ -111,8 +111,11 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
ctx := context.Background()
f.init() // Initialize table.
+ f.used = 0
for fd, d := range m {
- f.setAll(fd, d.file, d.fileVFS2, d.flags)
+ if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil {
+ panic("VFS1 or VFS2 files set")
+ }
// Note that we do _not_ need to acquire a extra table reference here. The
// table reference will already be accounted for in the file, so we drop the
@@ -127,7 +130,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
}
// drop drops the table reference.
-func (f *FDTable) drop(file *fs.File) {
+func (f *FDTable) drop(ctx context.Context, file *fs.File) {
// Release locks.
file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
@@ -145,14 +148,13 @@ func (f *FDTable) drop(file *fs.File) {
d.InotifyEvent(ev, 0)
// Drop the table reference.
- file.DecRef(context.Background())
+ file.DecRef(ctx)
}
// dropVFS2 drops the table reference.
-func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
+func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
// Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
// entire file.
- ctx := context.Background()
err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET)
if err != nil && err != syserror.ENOLCK {
panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
@@ -176,22 +178,15 @@ func (k *Kernel) NewFDTable() *FDTable {
return f
}
-// destroy removes all of the file descriptors from the map.
-func (f *FDTable) destroy(ctx context.Context) {
- f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
- return true
- })
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
+// DecRef implements RefCounter.DecRef.
+//
+// If f reaches zero references, all of its file descriptors are removed.
func (f *FDTable) DecRef(ctx context.Context) {
- f.DecRefWithDestructor(ctx, f.destroy)
-}
-
-// Size returns the number of file descriptor slots currently allocated.
-func (f *FDTable) Size() int {
- size := atomic.LoadInt32(&f.used)
- return int(size)
+ f.FDTableRefs.DecRef(func() {
+ f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
+ return true
+ })
+ })
}
// forEach iterates over all non-nil files in sorted order.
@@ -280,7 +275,6 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
}
f.mu.Lock()
- defer f.mu.Unlock()
// From f.next to find available fd.
if fd < f.next {
@@ -290,15 +284,25 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.get(i); d == nil {
- f.set(i, files[len(fds)], flags) // Set the descriptor.
- fds = append(fds, i) // Record the file descriptor.
+ // Set the descriptor.
+ f.set(ctx, i, files[len(fds)], flags)
+ fds = append(fds, i) // Record the file descriptor.
}
}
// Failure? Unwind existing FDs.
if len(fds) < len(files) {
for _, i := range fds {
- f.set(i, nil, FDFlags{}) // Zap entry.
+ f.set(ctx, i, nil, FDFlags{})
+ }
+ f.mu.Unlock()
+
+ // Drop the reference taken by the call to f.set() that
+ // originally installed the file. Don't call f.drop()
+ // (generating inotify events, etc.) since the file should
+ // appear to have never been inserted into f.
+ for _, file := range files[:len(fds)] {
+ file.DecRef(ctx)
}
return nil, syscall.EMFILE
}
@@ -308,6 +312,7 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
f.next = fds[len(fds)-1] + 1
}
+ f.mu.Unlock()
return fds, nil
}
@@ -335,7 +340,6 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
}
f.mu.Lock()
- defer f.mu.Unlock()
// From f.next to find available fd.
if fd < f.next {
@@ -345,15 +349,25 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.getVFS2(i); d == nil {
- f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
- fds = append(fds, i) // Record the file descriptor.
+ // Set the descriptor.
+ f.setVFS2(ctx, i, files[len(fds)], flags)
+ fds = append(fds, i) // Record the file descriptor.
}
}
// Failure? Unwind existing FDs.
if len(fds) < len(files) {
for _, i := range fds {
- f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+ f.setVFS2(ctx, i, nil, FDFlags{})
+ }
+ f.mu.Unlock()
+
+ // Drop the reference taken by the call to f.setVFS2() that
+ // originally installed the file. Don't call f.dropVFS2()
+ // (generating inotify events, etc.) since the file should
+ // appear to have never been inserted into f.
+ for _, file := range files[:len(fds)] {
+ file.DecRef(ctx)
}
return nil, syscall.EMFILE
}
@@ -363,6 +377,7 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
f.next = fds[len(fds)-1] + 1
}
+ f.mu.Unlock()
return fds, nil
}
@@ -398,7 +413,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
}
for fd < end {
if d, _, _ := f.getVFS2(fd); d == nil {
- f.setVFS2(fd, file, flags)
+ f.setVFS2(ctx, fd, file, flags)
if fd == f.next {
// Update next search start position.
f.next = fd + 1
@@ -414,40 +429,55 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
- return f.newFDAt(ctx, fd, file, nil, flags)
+ df, _, err := f.newFDAt(ctx, fd, file, nil, flags)
+ if err != nil {
+ return err
+ }
+ if df != nil {
+ f.drop(ctx, df)
+ }
+ return nil
}
// NewFDAtVFS2 sets the file reference for the given FD. If there is an active
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
- return f.newFDAt(ctx, fd, nil, file, flags)
+ _, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags)
+ if err != nil {
+ return err
+ }
+ if dfVFS2 != nil {
+ f.dropVFS2(ctx, dfVFS2)
+ }
+ return nil
}
-func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
+func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) {
if fd < 0 {
// Don't accept negative FDs.
- return syscall.EBADF
+ return nil, nil, syscall.EBADF
}
// Check the limit for the provided file.
if limitSet := limits.FromContext(ctx); limitSet != nil {
if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
- return syscall.EMFILE
+ return nil, nil, syscall.EMFILE
}
}
// Install the entry.
f.mu.Lock()
defer f.mu.Unlock()
- f.setAll(fd, file, fileVFS2, flags)
- return nil
+
+ df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags)
+ return df, dfVFS2, nil
}
// SetFlags sets the flags for the given file descriptor.
//
// True is returned iff flags were changed.
-func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
@@ -463,14 +493,14 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
}
// Update the flags.
- f.set(fd, file, flags)
+ f.set(ctx, fd, file, flags)
return nil
}
// SetFlagsVFS2 sets the flags for the given file descriptor.
//
// True is returned iff flags were changed.
-func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
@@ -486,7 +516,7 @@ func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
}
// Update the flags.
- f.setVFS2(fd, file, flags)
+ f.setVFS2(ctx, fd, file, flags)
return nil
}
@@ -552,30 +582,6 @@ func (f *FDTable) GetFDs(ctx context.Context) []int32 {
return fds
}
-// GetRefs returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefs(ctx context.Context) []*fs.File {
- files := make([]*fs.File, 0, f.Size())
- f.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- file.IncRef() // Acquire a reference for caller.
- files = append(files, file)
- })
- return files
-}
-
-// GetRefsVFS2 returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefsVFS2(ctx context.Context) []*vfs.FileDescription {
- files := make([]*vfs.FileDescription, 0, f.Size())
- f.forEach(ctx, func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
- file.IncRef() // Acquire a reference for caller.
- files = append(files, file)
- })
- return files
-}
-
// Fork returns an independent FDTable.
func (f *FDTable) Fork(ctx context.Context) *FDTable {
clone := f.k.NewFDTable()
@@ -583,11 +589,8 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
// The set function here will acquire an appropriate table
// reference for the clone. We don't need anything else.
- switch {
- case file != nil:
- clone.set(fd, file, flags)
- case fileVFS2 != nil:
- clone.setVFS2(fd, fileVFS2, flags)
+ if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil {
+ panic("VFS1 or VFS2 files set")
}
})
return clone
@@ -596,13 +599,12 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
// Remove removes an FD from and returns a non-file iff successful.
//
// N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
+func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) {
if fd < 0 {
return nil, nil
}
f.mu.Lock()
- defer f.mu.Unlock()
// Update current available position.
if fd < f.next {
@@ -618,24 +620,51 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
case orig2 != nil:
orig2.IncRef()
}
+
if orig != nil || orig2 != nil {
- f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+ orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry.
}
+ f.mu.Unlock()
+
+ if orig != nil {
+ f.drop(ctx, orig)
+ }
+ if orig2 != nil {
+ f.dropVFS2(ctx, orig2)
+ }
+
return orig, orig2
}
// RemoveIf removes all FDs where cond is true.
func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
- f.mu.Lock()
- defer f.mu.Unlock()
+ // TODO(gvisor.dev/issue/1624): Remove fs.File slice.
+ var files []*fs.File
+ var filesVFS2 []*vfs.FileDescription
+ f.mu.Lock()
f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
if cond(file, fileVFS2, flags) {
- f.set(fd, nil, FDFlags{}) // Clear from table.
+ df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table.
+ if df != nil {
+ files = append(files, df)
+ }
+ if dfVFS2 != nil {
+ filesVFS2 = append(filesVFS2, dfVFS2)
+ }
// Update current available position.
if fd < f.next {
f.next = fd
}
}
})
+ f.mu.Unlock()
+
+ for _, file := range files {
+ f.drop(ctx, file)
+ }
+
+ for _, file := range filesVFS2 {
+ f.dropVFS2(ctx, file)
+ }
}
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index e3f30ba2a..bf5460083 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -72,7 +72,7 @@ func TestFDTableMany(t *testing.T) {
}
i := int32(2)
- fdTable.Remove(i)
+ fdTable.Remove(ctx, i)
if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i {
t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err)
}
@@ -93,7 +93,7 @@ func TestFDTableOverLimit(t *testing.T) {
t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err)
} else {
for _, fd := range fds {
- fdTable.Remove(fd)
+ fdTable.Remove(ctx, fd)
}
}
@@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) {
t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
}
- ref, _ := fdTable.Remove(1)
+ ref, _ := fdTable.Remove(ctx, 1)
if ref == nil {
t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
}
ref.DecRef(ctx)
- if ref, _ := fdTable.Remove(1); ref != nil {
+ if ref, _ := fdTable.Remove(ctx, 1); ref != nil {
t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
}
})
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 7fd97dc53..da79e6627 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"unsafe"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
@@ -31,6 +32,8 @@ type descriptorTable struct {
}
// init initializes the table.
+//
+// TODO(gvisor.dev/1486): Enable leak check for FDTable.
func (f *FDTable) init() {
var slice []unsafe.Pointer // Empty slice.
atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
@@ -76,33 +79,37 @@ func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, boo
return d.file, d.fileVFS2, d.flags, true
}
-// set sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// CurrentMaxFDs returns the number of file descriptors that may be stored in f
+// without reallocation.
+func (f *FDTable) CurrentMaxFDs() int {
+ slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+ return len(slice)
+}
+
+// set sets an entry for VFS1, refer to setAll().
//
// Precondition: mu must be held.
-func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
- f.setAll(fd, file, nil, flags)
+func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) *fs.File {
+ dropFile, _ := f.setAll(ctx, fd, file, nil, flags)
+ return dropFile
}
-// setVFS2 sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setVFS2 sets an entry for VFS2, refer to setAll().
//
// Precondition: mu must be held.
-func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
- f.setAll(fd, nil, file, flags)
+func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription {
+ _, dropFile := f.setAll(ctx, fd, nil, file, flags)
+ return dropFile
}
-// setAll sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setAll sets the file description referred to by fd to file/fileVFS2. If
+// file/fileVFS2 are non-nil, it takes a reference on them. If setAll replaces
+// an existing file description, it returns it with the FDTable's reference
+// transferred to the caller, which must call f.drop/dropVFS2() on the returned
+// file after unlocking f.mu.
//
// Precondition: mu must be held.
-func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription) {
if file != nil && fileVFS2 != nil {
panic("VFS1 and VFS2 files set")
}
@@ -145,25 +152,25 @@ func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription,
}
}
- // Drop the table reference.
+ // Adjust used.
+ switch {
+ case orig == nil && desc != nil:
+ atomic.AddInt32(&f.used, 1)
+ case orig != nil && desc == nil:
+ atomic.AddInt32(&f.used, -1)
+ }
+
if orig != nil {
switch {
case orig.file != nil:
if desc == nil || desc.file != orig.file {
- f.drop(orig.file)
+ return orig.file, nil
}
case orig.fileVFS2 != nil:
if desc == nil || desc.fileVFS2 != orig.fileVFS2 {
- f.dropVFS2(orig.fileVFS2)
+ return nil, orig.fileVFS2
}
}
}
-
- // Adjust used.
- switch {
- case orig == nil && desc != nil:
- atomic.AddInt32(&f.used, 1)
- case orig != nil && desc == nil:
- atomic.AddInt32(&f.used, -1)
- }
+ return nil, nil
}
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 8f2d36d5a..d46d1e1c1 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -18,7 +18,6 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -30,7 +29,7 @@ import (
//
// +stateify savable
type FSContext struct {
- refs.AtomicRefCount
+ FSContextRefs
// mu protects below.
mu sync.Mutex `state:"nosave"`
@@ -64,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
cwd: cwd,
umask: umask,
}
- f.EnableLeakCheck("kernel.FSContext")
+ f.EnableLeakCheck()
return &f
}
@@ -77,54 +76,56 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
cwdVFS2: cwd,
umask: umask,
}
- f.EnableLeakCheck("kernel.FSContext")
+ f.EnableLeakCheck()
return &f
}
-// destroy is the destructor for an FSContext.
+// DecRef implements RefCounter.DecRef.
//
-// This will call DecRef on both root and cwd Dirents. If either call to
-// DecRef returns an error, then it will be propagated. If both calls to
-// DecRef return an error, then the one from root.DecRef will be propagated.
+// When f reaches zero references, DecRef will be called on both root and cwd
+// Dirents.
//
// Note that there may still be calls to WorkingDirectory() or RootDirectory()
// (that return nil). This is because valid references may still be held via
// proc files or other mechanisms.
-func (f *FSContext) destroy(ctx context.Context) {
- // Hold f.mu so that we don't race with RootDirectory() and
- // WorkingDirectory().
- f.mu.Lock()
- defer f.mu.Unlock()
-
- if VFS2Enabled {
- f.rootVFS2.DecRef(ctx)
- f.rootVFS2 = vfs.VirtualDentry{}
- f.cwdVFS2.DecRef(ctx)
- f.cwdVFS2 = vfs.VirtualDentry{}
- } else {
- f.root.DecRef(ctx)
- f.root = nil
- f.cwd.DecRef(ctx)
- f.cwd = nil
- }
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
func (f *FSContext) DecRef(ctx context.Context) {
- f.DecRefWithDestructor(ctx, f.destroy)
+ f.FSContextRefs.DecRef(func() {
+ // Hold f.mu so that we don't race with RootDirectory() and
+ // WorkingDirectory().
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ if VFS2Enabled {
+ f.rootVFS2.DecRef(ctx)
+ f.rootVFS2 = vfs.VirtualDentry{}
+ f.cwdVFS2.DecRef(ctx)
+ f.cwdVFS2 = vfs.VirtualDentry{}
+ } else {
+ f.root.DecRef(ctx)
+ f.root = nil
+ f.cwd.DecRef(ctx)
+ f.cwd = nil
+ }
+ })
}
// Fork forks this FSContext.
//
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
func (f *FSContext) Fork() *FSContext {
f.mu.Lock()
defer f.mu.Unlock()
if VFS2Enabled {
+ if !f.cwdVFS2.Ok() {
+ panic("FSContext.Fork() called after destroy")
+ }
f.cwdVFS2.IncRef()
f.rootVFS2.IncRef()
} else {
+ if f.cwd == nil {
+ panic("FSContext.Fork() called after destroy")
+ }
f.cwd.IncRef()
f.root.IncRef()
}
@@ -140,8 +141,8 @@ func (f *FSContext) Fork() *FSContext {
// WorkingDirectory returns the current working directory.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
func (f *FSContext) WorkingDirectory() *fs.Dirent {
f.mu.Lock()
defer f.mu.Unlock()
@@ -152,8 +153,8 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent {
// WorkingDirectoryVFS2 returns the current working directory.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
f.mu.Lock()
defer f.mu.Unlock()
@@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
// SetWorkingDirectory sets the current working directory.
// This will take an extra reference on the Dirent.
//
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
if d == nil {
panic("FSContext.SetWorkingDirectory called with nil dirent")
@@ -187,11 +188,15 @@ func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
// SetWorkingDirectoryVFS2 sets the current working directory.
// This will take an extra reference on the VirtualDentry.
//
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) {
f.mu.Lock()
defer f.mu.Unlock()
+ if !f.cwdVFS2.Ok() {
+ panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v)) called after destroy", d))
+ }
+
old := f.cwdVFS2
f.cwdVFS2 = d
d.IncRef()
@@ -200,8 +205,8 @@ func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDe
// RootDirectory returns the current filesystem root.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
func (f *FSContext) RootDirectory() *fs.Dirent {
f.mu.Lock()
defer f.mu.Unlock()
@@ -213,8 +218,8 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
// RootDirectoryVFS2 returns the current filesystem root.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
f.mu.Lock()
defer f.mu.Unlock()
@@ -226,7 +231,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
// SetRootDirectory sets the root directory.
// This will take an extra reference on the Dirent.
//
-// This is not a valid call after free.
+// This is not a valid call after f is destroyed.
func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) {
if d == nil {
panic("FSContext.SetRootDirectory called with nil dirent")
@@ -247,7 +252,7 @@ func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) {
// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
//
-// This is not a valid call after free.
+// This is not a valid call after f is destroyed.
func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) {
if !vd.Ok() {
panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go
new file mode 100644
index 000000000..aad63aa99
--- /dev/null
+++ b/pkg/sentry/kernel/kcov.go
@@ -0,0 +1,321 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/coverage"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov
+// area. On Linux, the maximum is INT_MAX / 8.
+const kcovAreaSizeMax = 10 * 1024 * 1024
+
+// Kcov provides kernel coverage data to userspace through a memory-mapped
+// region, as kcov does in Linux.
+//
+// To give the illusion that the data is always up to date, we update the shared
+// memory every time before we return to userspace.
+type Kcov struct {
+ // mfp provides application memory. It is immutable after creation.
+ mfp pgalloc.MemoryFileProvider
+
+ // mu protects all of the fields below.
+ mu sync.RWMutex
+
+ // mode is the current kcov mode.
+ mode uint8
+
+ // size is the size of the mapping through which the kernel conveys coverage
+ // information to userspace.
+ size uint64
+
+ // owningTask is the task that currently owns coverage data on the system. The
+ // interface for kcov essentially requires that coverage is only going to a
+ // single task. Note that kcov should only generate coverage data for the
+ // owning task, but we currently generate global coverage.
+ owningTask *Task
+
+ // count is a locally cached version of the first uint64 in the kcov data,
+ // which is the number of subsequent entries representing PCs.
+ //
+ // It is used with kcovInode.countBlock(), to copy in/out the first element of
+ // the actual data in an efficient manner, avoid boilerplate, and prevent
+ // accidental garbage escapes by the temporary counts.
+ count uint64
+
+ mappable *mm.SpecialMappable
+}
+
+// NewKcov creates and returns a Kcov instance.
+func (k *Kernel) NewKcov() *Kcov {
+ return &Kcov{
+ mfp: k,
+ }
+}
+
+var coveragePool = sync.Pool{
+ New: func() interface{} {
+ return make([]byte, 0)
+ },
+}
+
+// TaskWork implements TaskWorker.TaskWork.
+func (kcov *Kcov) TaskWork(t *Task) {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ rw := &kcovReadWriter{
+ mf: kcov.mfp.MemoryFile(),
+ fr: kcov.mappable.FileRange(),
+ }
+
+ // Read in the PC count.
+ if _, err := safemem.ReadFullToBlocks(rw, kcov.countBlock()); err != nil {
+ panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err))
+ }
+
+ rw.off = 8 * (1 + kcov.count)
+ n := coverage.ConsumeCoverageData(&kcovIOWriter{rw})
+
+ // Update the pc count, based on the number of entries written. Note that if
+ // we reached the end of the kcov area, we may not have written everything in
+ // output.
+ kcov.count += uint64(n / 8)
+ rw.off = 0
+ if _, err := safemem.WriteFullFromBlocks(rw, kcov.countBlock()); err != nil {
+ panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err))
+ }
+
+ // Re-register for future work.
+ t.RegisterWork(kcov)
+}
+
+// InitTrace performs the KCOV_INIT_TRACE ioctl.
+func (kcov *Kcov) InitTrace(size uint64) error {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ if kcov.mode != linux.KCOV_MODE_DISABLED {
+ return syserror.EBUSY
+ }
+
+ // To simplify all the logic around mapping, we require that the length of the
+ // shared region is a multiple of the system page size.
+ if (8*size)&(usermem.PageSize-1) != 0 {
+ return syserror.EINVAL
+ }
+
+ // We need space for at least two uint64s to hold current position and a
+ // single PC.
+ if size < 2 || size > kcovAreaSizeMax {
+ return syserror.EINVAL
+ }
+
+ kcov.size = size
+ kcov.mode = linux.KCOV_MODE_INIT
+ return nil
+}
+
+// EnableTrace performs the KCOV_ENABLE_TRACE ioctl.
+func (kcov *Kcov) EnableTrace(ctx context.Context, traceMode uint8) error {
+ t := TaskFromContext(ctx)
+ if t == nil {
+ panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+ }
+
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ // KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call.
+ if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil {
+ return syserror.EINVAL
+ }
+
+ switch traceMode {
+ case linux.KCOV_TRACE_PC:
+ kcov.mode = traceMode
+ case linux.KCOV_TRACE_CMP:
+ // We do not support KCOV_MODE_TRACE_CMP.
+ return syserror.ENOTSUP
+ default:
+ return syserror.EINVAL
+ }
+
+ if kcov.owningTask != nil && kcov.owningTask != t {
+ return syserror.EBUSY
+ }
+
+ kcov.owningTask = t
+ t.RegisterWork(kcov)
+
+ // Clear existing coverage data; the task expects to read only coverage data
+ // from the time it is activated.
+ coverage.ClearCoverageData()
+ return nil
+}
+
+// DisableTrace performs the KCOV_DISABLE_TRACE ioctl.
+func (kcov *Kcov) DisableTrace(ctx context.Context) error {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ t := TaskFromContext(ctx)
+ if t == nil {
+ panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+ }
+
+ if t != kcov.owningTask {
+ return syserror.EINVAL
+ }
+ kcov.owningTask = nil
+ kcov.mode = linux.KCOV_MODE_INIT
+ kcov.resetLocked()
+ return nil
+}
+
+// Reset is called when the owning task exits.
+func (kcov *Kcov) Reset() {
+ kcov.mu.Lock()
+ kcov.resetLocked()
+ kcov.mu.Unlock()
+}
+
+// The kcov instance is reset when the owning task exits or when tracing is
+// disabled.
+func (kcov *Kcov) resetLocked() {
+ kcov.owningTask = nil
+ if kcov.mappable != nil {
+ kcov.mappable = nil
+ }
+}
+
+// ConfigureMMap is called by the vfs.FileDescription for this kcov instance to
+// implement vfs.FileDescription.ConfigureMMap.
+func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ if kcov.mode != linux.KCOV_MODE_INIT {
+ return syserror.EINVAL
+ }
+
+ if kcov.mappable == nil {
+ // Set up the kcov area.
+ fr, err := kcov.mfp.MemoryFile().Allocate(kcov.size*8, usage.Anonymous)
+ if err != nil {
+ return err
+ }
+
+ // Get the thread id for the mmap name.
+ t := TaskFromContext(ctx)
+ if t == nil {
+ panic("ThreadFromContext returned nil")
+ }
+ // For convenience, a special mappable is used here. Note that these mappings
+ // will look different under /proc/[pid]/maps than they do on Linux.
+ kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mfp, fr)
+ }
+ opts.Mappable = kcov.mappable
+ opts.MappingIdentity = kcov.mappable
+ return nil
+}
+
+// kcovReadWriter implements safemem.Reader and safemem.Writer.
+type kcovReadWriter struct {
+ off uint64
+ mf *pgalloc.MemoryFile
+ fr memmap.FileRange
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ if dsts.IsEmpty() {
+ return 0, nil
+ }
+
+ // Limit the read to the kcov range and check for overflow.
+ if rw.fr.Length() <= rw.off {
+ return 0, io.EOF
+ }
+ start := rw.fr.Start + rw.off
+ end := rw.fr.Start + rw.fr.Length()
+ if rend := start + dsts.NumBytes(); rend < end {
+ end = rend
+ }
+
+ // Get internal mappings.
+ bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read)
+ if err != nil {
+ return 0, err
+ }
+
+ // Copy from internal mappings.
+ n, err := safemem.CopySeq(dsts, bs)
+ rw.off += n
+ return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ if srcs.IsEmpty() {
+ return 0, nil
+ }
+
+ // Limit the write to the kcov area and check for overflow.
+ if rw.fr.Length() <= rw.off {
+ return 0, io.EOF
+ }
+ start := rw.fr.Start + rw.off
+ end := rw.fr.Start + rw.fr.Length()
+ if wend := start + srcs.NumBytes(); wend < end {
+ end = wend
+ }
+
+ // Get internal mapping.
+ bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write)
+ if err != nil {
+ return 0, err
+ }
+
+ // Copy to internal mapping.
+ n, err := safemem.CopySeq(bs, srcs)
+ rw.off += n
+ return n, err
+}
+
+// kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter.
+type kcovIOWriter struct {
+ rw *kcovReadWriter
+}
+
+// Write implements io.Writer.Write.
+func (w *kcovIOWriter) Write(p []byte) (int, error) {
+ bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p))
+ n, err := safemem.WriteFullFromBlocks(w.rw, bs)
+ return int(n), err
+}
diff --git a/pkg/sentry/kernel/kcov_unsafe.go b/pkg/sentry/kernel/kcov_unsafe.go
new file mode 100644
index 000000000..6f64022eb
--- /dev/null
+++ b/pkg/sentry/kernel/kcov_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/safemem"
+)
+
+// countBlock provides a safemem.BlockSeq for k.count.
+//
+// Like k.count, the block returned is protected by k.mu.
+func (k *Kcov) countBlock() safemem.BlockSeq {
+ return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&k.count), int(unsafe.Sizeof(k.count))))
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 1028d13c6..22f9bb006 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -248,7 +248,7 @@ type Kernel struct {
// SpecialOpts contains special kernel options.
SpecialOpts
- // VFS keeps the filesystem state used across the kernel.
+ // vfs keeps the filesystem state used across the kernel.
vfs vfs.VirtualFilesystem
// hostMount is the Mount used for file descriptors that were imported
@@ -888,17 +888,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
opener fsbridge.Lookup
fsContext *FSContext
mntns *fs.MountNamespace
+ mntnsVFS2 *vfs.MountNamespace
)
if VFS2Enabled {
- mntnsVFS2 := args.MountNamespaceVFS2
+ mntnsVFS2 = args.MountNamespaceVFS2
if mntnsVFS2 == nil {
// MountNamespaceVFS2 adds a reference to the namespace, which is
// transferred to the new process.
mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
}
// Get the root directory from the MountNamespace.
- root := args.MountNamespaceVFS2.Root()
+ root := mntnsVFS2.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
defer root.DecRef(ctx)
@@ -1008,7 +1009,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
UTSNamespace: args.UTSNamespace,
IPCNamespace: args.IPCNamespace,
AbstractSocketNamespace: args.AbstractSocketNamespace,
- MountNamespaceVFS2: args.MountNamespaceVFS2,
+ MountNamespaceVFS2: mntnsVFS2,
ContainerID: args.ContainerID,
}
t, err := k.tasks.NewTask(config)
@@ -1067,8 +1068,9 @@ func (k *Kernel) Start() error {
// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
func (k *Kernel) pauseTimeLocked(ctx context.Context) {
// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
// Kernel.Start().
@@ -1111,8 +1113,9 @@ func (k *Kernel) pauseTimeLocked(ctx context.Context) {
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
func (k *Kernel) resumeTimeLocked(ctx context.Context) {
if k.cpuClockTicker != nil {
k.cpuClockTicker.Resume()
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 297e8f28f..c410c96aa 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -200,17 +200,17 @@ type readOps struct {
//
// Precondition: this pipe must have readers.
func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
- // Don't block for a zero-length read even if the pipe is empty.
- if ops.left() == 0 {
- return 0, nil
- }
-
p.mu.Lock()
defer p.mu.Unlock()
return p.readLocked(ctx, ops)
}
func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
+ // Don't block for a zero-length read even if the pipe is empty.
+ if ops.left() == 0 {
+ return 0, nil
+ }
+
// Is the pipe empty?
if p.view.Size() == 0 {
if !p.HasWriters() {
@@ -388,6 +388,10 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
func (p *Pipe) queued() int64 {
p.mu.Lock()
defer p.mu.Unlock()
+ return p.queuedLocked()
+}
+
+func (p *Pipe) queuedLocked() int64 {
return p.view.Size()
}
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 28f998e45..f61039f5b 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -67,6 +67,11 @@ func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlag
return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
+ return syserror.ESPIPE
+}
+
// Open opens the pipe represented by vp.
func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
vp.mu.Lock()
@@ -244,19 +249,57 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
return fd.pipe.SetFifoSize(size)
}
-// IOSequence returns a useremm.IOSequence that reads up to count bytes from,
-// or writes up to count bytes to, fd.
-func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
- return usermem.IOSequence{
+// SpliceToNonPipe performs a splice operation from fd to a non-pipe file.
+func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) {
+ fd.pipe.mu.Lock()
+ defer fd.pipe.mu.Unlock()
+
+ // Cap the sequence at number of bytes actually available.
+ v := fd.pipe.queuedLocked()
+ if v < count {
+ count = v
+ }
+ src := usermem.IOSequence{
IO: fd,
Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
}
+
+ var (
+ n int64
+ err error
+ )
+ if off == -1 {
+ n, err = out.Write(ctx, src, vfs.WriteOptions{})
+ } else {
+ n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{})
+ }
+ if n > 0 {
+ fd.pipe.view.TrimFront(n)
+ }
+ return n, err
+}
+
+// SpliceFromNonPipe performs a splice operation from a non-pipe file to fd.
+func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
+ fd.pipe.mu.Lock()
+ defer fd.pipe.mu.Unlock()
+
+ dst := usermem.IOSequence{
+ IO: fd,
+ Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+ }
+
+ if off == -1 {
+ return in.Read(ctx, dst, vfs.ReadOptions{})
+ }
+ return in.PRead(ctx, dst, off, vfs.ReadOptions{})
}
-// CopyIn implements usermem.IO.CopyIn.
+// CopyIn implements usermem.IO.CopyIn. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
origCount := int64(len(dst))
- n, err := fd.pipe.read(ctx, readOps{
+ n, err := fd.pipe.readLocked(ctx, readOps{
left: func() int64 {
return int64(len(dst))
},
@@ -265,7 +308,6 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
},
read: func(view *buffer.View) (int64, error) {
n, err := view.ReadAt(dst, 0)
- view.TrimFront(int64(n))
return int64(n), err
},
})
@@ -281,7 +323,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
// CopyOut implements usermem.IO.CopyOut.
func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
origCount := int64(len(src))
- n, err := fd.pipe.write(ctx, writeOps{
+ n, err := fd.pipe.writeLocked(ctx, writeOps{
left: func() int64 {
return int64(len(src))
},
@@ -305,7 +347,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte,
// ZeroOut implements usermem.IO.ZeroOut.
func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
origCount := toZero
- n, err := fd.pipe.write(ctx, writeOps{
+ n, err := fd.pipe.writeLocked(ctx, writeOps{
left: func() int64 {
return toZero
},
@@ -326,14 +368,15 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6
return n, err
}
-// CopyInTo implements usermem.IO.CopyInTo.
+// CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
count := ars.NumBytes()
if count == 0 {
return 0, nil
}
origCount := count
- n, err := fd.pipe.read(ctx, readOps{
+ n, err := fd.pipe.readLocked(ctx, readOps{
left: func() int64 {
return count
},
@@ -342,7 +385,6 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst
},
read: func(view *buffer.View) (int64, error) {
n, err := view.ReadToSafememWriter(dst, uint64(count))
- view.TrimFront(int64(n))
return int64(n), err
},
})
@@ -362,7 +404,7 @@ func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq,
return 0, nil
}
origCount := count
- n, err := fd.pipe.write(ctx, writeOps{
+ n, err := fd.pipe.writeLocked(ctx, writeOps{
left: func() int64 {
return count
},
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 619b0cb7c..1145faf13 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/syserror"
@@ -224,8 +225,9 @@ func (s *ptraceStop) Killable() bool {
// beginPtraceStopLocked does not signal t's tracer or wake it if it is
// waiting.
//
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine.
func (t *Task) beginPtraceStopLocked() bool {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
@@ -270,8 +272,9 @@ func (t *Task) ptraceTrapLocked(code int32) {
// ptraceStop, temporarily preventing it from being removed by a concurrent
// Task.Kill, and returns true. Otherwise it returns false.
//
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine of t's tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine of t's tracer.
func (t *Task) ptraceFreeze() bool {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
@@ -301,8 +304,9 @@ func (t *Task) ptraceUnfreeze() {
t.ptraceUnfreezeLocked()
}
-// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be
-// locked.
+// Preconditions:
+// * t must be in a frozen ptraceStop.
+// * t's signal mutex must be locked.
func (t *Task) ptraceUnfreezeLocked() {
// Do this even if the task has been killed to ensure a panic if t.stop is
// nil or not a ptraceStop.
@@ -497,8 +501,9 @@ func (t *Task) forgetTracerLocked() {
// ptraceSignalLocked is called after signal dequeueing to check if t should
// enter ptrace signal-delivery-stop.
//
-// Preconditions: The signal mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The signal mutex must be locked.
+// * The caller must be running on the task goroutine.
func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool {
if linux.Signal(info.Signo) == linux.SIGKILL {
return false
@@ -828,8 +833,9 @@ func (t *Task) ptraceInterrupt(target *Task) error {
return nil
}
-// Preconditions: The TaskSet mutex must be locked for writing. t must have a
-// tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked for writing.
+// * t must have a tracer.
func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
const valid = uintptr(linux.PTRACE_O_EXITKILL |
linux.PTRACE_O_TRACESYSGOOD |
@@ -994,18 +1000,15 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
// at the address specified by the data parameter, and the return value
// is the error flag." - ptrace(2)
word := t.Arch().Native(0)
- if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{
- IgnorePermissions: true,
- }); err != nil {
+ if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
return err
}
- _, err := t.CopyOut(data, word)
+ _, err := word.CopyOut(t, data)
return err
case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
- _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{
- IgnorePermissions: true,
- })
+ word := t.Arch().Native(uintptr(data))
+ _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr)
return err
case linux.PTRACE_GETREGSET:
@@ -1073,12 +1076,12 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if target.ptraceSiginfo == nil {
return syserror.EINVAL
}
- _, err := t.CopyOut(data, target.ptraceSiginfo)
+ _, err := target.ptraceSiginfo.CopyOut(t, data)
return err
case linux.PTRACE_SETSIGINFO:
var info arch.SignalInfo
- if _, err := t.CopyIn(data, &info); err != nil {
+ if _, err := info.CopyIn(t, data); err != nil {
return err
}
t.tg.pidns.owner.mu.RLock()
@@ -1093,7 +1096,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if addr != linux.SignalSetSize {
return syserror.EINVAL
}
- _, err := t.CopyOut(data, target.SignalMask())
+ mask := target.SignalMask()
+ _, err := mask.CopyOut(t, data)
return err
case linux.PTRACE_SETSIGMASK:
@@ -1101,7 +1105,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
return syserror.EINVAL
}
var mask linux.SignalSet
- if _, err := t.CopyIn(data, &mask); err != nil {
+ if _, err := mask.CopyIn(t, data); err != nil {
return err
}
// The target's task goroutine is stopped, so this is safe:
@@ -1116,7 +1120,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
case linux.PTRACE_GETEVENTMSG:
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
- _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg)
+ _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg)
return err
// PEEKSIGINFO is unimplemented but seems to have no users anywhere.
diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go
index cef1276ec..609ad3941 100644
--- a/pkg/sentry/kernel/ptrace_amd64.go
+++ b/pkg/sentry/kernel/ptrace_amd64.go
@@ -30,7 +30,7 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro
if err != nil {
return err
}
- _, err = t.CopyOut(data, n)
+ _, err = n.CopyOut(t, data)
return err
case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 18416643b..2a9023fdf 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -173,8 +173,10 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr {
// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
// t's CPU number.
//
-// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions:
+// * t.RSeqAvailable() == true.
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
t.oldRSeqCPUAddr = addr
@@ -189,8 +191,9 @@ func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
return nil
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqUpdateCPU() error {
if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
t.rseqCPU = -1
@@ -209,8 +212,9 @@ func (t *Task) rseqUpdateCPU() error {
return oerr
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) oldRSeqCopyOutCPU() error {
if t.oldRSeqCPUAddr == 0 {
return nil
@@ -222,8 +226,9 @@ func (t *Task) oldRSeqCopyOutCPU() error {
return err
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqCopyOutCPU() error {
if t.rseqAddr == 0 {
return nil
@@ -240,8 +245,9 @@ func (t *Task) rseqCopyOutCPU() error {
return err
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqClearCPU() error {
buf := t.CopyScratchBuffer(8)
// CPUIDStart and CPUID are the first two fields in linux.RSeq.
@@ -269,8 +275,9 @@ func (t *Task) rseqClearCPU() error {
//
// See kernel/rseq.c:rseq_ip_fixup for reference.
//
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqAddrInterrupt() {
if t.rseqAddr == 0 {
return
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 5c4c622c2..df5c8421b 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -16,8 +16,6 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -32,7 +30,7 @@ type ProcessGroupID ThreadID
//
// +stateify savable
type Session struct {
- refs refs.AtomicRefCount
+ SessionRefs
// leader is the originator of the Session.
//
@@ -62,16 +60,11 @@ type Session struct {
sessionEntry
}
-// incRef grabs a reference.
-func (s *Session) incRef() {
- s.refs.IncRef()
-}
-
-// decRef drops a reference.
+// DecRef drops a reference.
//
// Precondition: callers must hold TaskSet.mu for writing.
-func (s *Session) decRef() {
- s.refs.DecRefWithDestructor(nil, func(context.Context) {
+func (s *Session) DecRef() {
+ s.SessionRefs.DecRef(func() {
// Remove translations from the leader.
for ns := s.leader.pidns; ns != nil; ns = ns.parent {
id := ns.sids[s]
@@ -88,7 +81,7 @@ func (s *Session) decRef() {
//
// +stateify savable
type ProcessGroup struct {
- refs refs.AtomicRefCount // not exported.
+ refs ProcessGroupRefs
// originator is the originator of the group.
//
@@ -163,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
}
alive := true
- pg.refs.DecRefWithDestructor(nil, func(context.Context) {
+ pg.refs.DecRef(func() {
alive = false // don't bother with handleOrphan.
// Remove translations from the originator.
@@ -175,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
// Remove the list of process groups.
pg.session.processGroups.Remove(pg)
- pg.session.decRef()
+ pg.session.DecRef()
})
if alive {
pg.handleOrphan()
@@ -302,7 +295,7 @@ func (tg *ThreadGroup) createSession() error {
id: SessionID(id),
leader: tg,
}
- s.refs.EnableLeakCheck("kernel.Session")
+ s.EnableLeakCheck()
// Create a new ProcessGroup, belonging to that Session.
// This also has a single reference (assigned below).
@@ -316,7 +309,7 @@ func (tg *ThreadGroup) createSession() error {
session: s,
ancestors: 0,
}
- pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+ pg.refs.EnableLeakCheck()
// Tie them and return the result.
s.processGroups.PushBack(pg)
@@ -396,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
//
// We manually adjust the ancestors if the parent is in the same
// session.
- tg.processGroup.session.incRef()
+ tg.processGroup.session.IncRef()
pg := ProcessGroup{
id: ProcessGroupID(id),
originator: tg,
session: tg.processGroup.session,
}
- pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+ pg.refs.EnableLeakCheck()
if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
pg.ancestors++
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index c211fc8d0..b7e4b480d 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -1,12 +1,25 @@
load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
package(licenses = ["notice"])
+go_template_instance(
+ name = "shm_refs",
+ out = "shm_refs.go",
+ package = "shm",
+ prefix = "Shm",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "Shm",
+ },
+)
+
go_library(
name = "shm",
srcs = [
"device.go",
"shm.go",
+ "shm_refs.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 13ec7afe0..00c03585e 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -39,7 +39,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -252,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
creatorPID: pid,
changeTime: ktime.NowFromContext(ctx),
}
- shm.EnableLeakCheck("kernel.Shm")
+ shm.EnableLeakCheck()
// Find the next available ID.
for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
@@ -337,14 +336,14 @@ func (r *Registry) remove(s *Shm) {
//
// +stateify savable
type Shm struct {
- // AtomicRefCount tracks the number of references to this segment.
+ // ShmRefs tracks the number of references to this segment.
//
// A segment holds a reference to itself until it is marked for
// destruction.
//
// In addition to direct users, the MemoryManager will hold references
// via MappingIdentity.
- refs.AtomicRefCount
+ ShmRefs
mfp pgalloc.MemoryFileProvider
@@ -428,11 +427,14 @@ func (s *Shm) InodeID() uint64 {
return uint64(s.ID)
}
-// DecRef overrides refs.RefCount.DecRef with a destructor.
+// DecRef drops a reference on s.
//
// Precondition: Caller must not hold s.mu.
func (s *Shm) DecRef(ctx context.Context) {
- s.DecRefWithDestructor(ctx, s.destroy)
+ s.ShmRefs.DecRef(func() {
+ s.mfp.MemoryFile().DecRef(s.fr)
+ s.registry.remove(s)
+ })
}
// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
@@ -642,11 +644,6 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
return nil
}
-func (s *Shm) destroy(context.Context) {
- s.mfp.MemoryFile().DecRef(s.fr)
- s.registry.remove(s)
-}
-
// MarkDestroyed marks a segment for destruction. The segment is actually
// destroyed once it has no references. MarkDestroyed may be called multiple
// times, and is safe to call after a segment has already been destroyed. See
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 413111faf..332bdb8e8 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -348,6 +348,16 @@ func (s *SyscallTable) LookupName(sysno uintptr) string {
return fmt.Sprintf("sys_%d", sysno) // Unlikely.
}
+// LookupNo looks up a syscall number by name.
+func (s *SyscallTable) LookupNo(name string) (uintptr, error) {
+ for i, syscall := range s.Table {
+ if syscall.Name == name {
+ return uintptr(i), nil
+ }
+ }
+ return 0, fmt.Errorf("syscall %q not found", name)
+}
+
// LookupEmulate looks up an emulation syscall number.
func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
sysno, ok := s.Emulate[addr]
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 5aee699e7..a436610c9 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -574,6 +574,11 @@ type Task struct {
//
// startTime is protected by mu.
startTime ktime.Time
+
+ // kcov is the kcov instance providing code coverage owned by this task.
+ //
+ // kcov is exclusive to the task goroutine.
+ kcov *Kcov
}
func (t *Task) savePtraceTracer() *Task {
@@ -903,3 +908,16 @@ func (t *Task) UID() uint32 {
func (t *Task) GID() uint32 {
return uint32(t.Credentials().EffectiveKGID)
}
+
+// SetKcov sets the kcov instance associated with t.
+func (t *Task) SetKcov(k *Kcov) {
+ t.kcov = k
+}
+
+// ResetKcov clears the kcov instance associated with t.
+func (t *Task) ResetKcov() {
+ if t.kcov != nil {
+ t.kcov.Reset()
+ t.kcov = nil
+ }
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 9d7a9128f..fce1064a7 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -341,12 +341,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
nt.SetClearTID(opts.ChildTID)
}
if opts.ChildSetTID {
- // Can't use Task.CopyOut, which assumes AddressSpaceActive.
- usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{})
+ ctid := nt.ThreadID()
+ ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
}
ntid := t.tg.pidns.IDOfTask(nt)
if opts.ParentSetTID {
- t.CopyOut(opts.ParentTID, ntid)
+ ntid.CopyOut(t, opts.ParentTID)
}
kind := ptraceCloneKindClone
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 5e4fb3e3a..412d471d3 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -237,9 +237,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
// promoteLocked makes t the leader of its thread group. If t is already the
// thread group leader, promoteLocked is a no-op.
//
-// Preconditions: All other tasks in t's thread group, including the existing
-// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must
-// be locked for writing.
+// Preconditions:
+// * All other tasks in t's thread group, including the existing leader (if it
+// is not t), have reached TaskExitZombie.
+// * The TaskSet mutex must be locked for writing.
func (t *Task) promoteLocked() {
oldLeader := t.tg.leader
if t == oldLeader {
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index c165d6cb1..b400a8b41 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -239,6 +239,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
t.traceExitEvent()
lastExiter := t.exitThreadGroup()
+ t.ResetKcov()
+
// If the task has a cleartid, and the thread group wasn't killed by a
// signal, handle that before releasing the MM.
if t.cleartid != 0 {
@@ -246,7 +248,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
t.tg.signalHandlers.mu.Unlock()
if !signaled {
- if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil {
+ zero := ThreadID(0)
+ if _, err := zero.CopyOut(t, t.cleartid); err == nil {
t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
}
// If the CopyOut fails, there's nothing we can do.
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index 4b535c949..c80391475 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -16,6 +16,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -87,7 +88,7 @@ func (t *Task) exitRobustList() {
return
}
- next := rl.List
+ next := primitive.Uint64(rl.List)
done := 0
var pendingLockAddr usermem.Addr
if rl.ListOpPending != 0 {
@@ -99,12 +100,12 @@ func (t *Task) exitRobustList() {
// We traverse to the next element of the list before we
// actually wake anything. This prevents the race where waking
// this futex causes a modification of the list.
- thisLockAddr := usermem.Addr(next + rl.FutexOffset)
+ thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset)
// Try to decode the next element in the list before waking the
// current futex. But don't check the error until after we've
// woken the current futex. Linux does it in this order too
- _, nextErr := t.CopyIn(usermem.Addr(next), &next)
+ _, nextErr := next.CopyIn(t, usermem.Addr(next))
// Wakeup the current futex if it's not pending.
if thisLockAddr != pendingLockAddr {
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index abaf29216..8dc3fec90 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -26,6 +26,7 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -140,7 +141,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error {
region := trace.StartRegion(t.traceContext, cpuidRegion)
expected := arch.CPUIDInstruction[:]
found := make([]byte, len(expected))
- _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+ _, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found)
if err == nil && bytes.Equal(expected, found) {
// Skip the cpuid instruction.
t.Arch().CPUIDEmulate(t)
@@ -189,8 +190,8 @@ func (app *runApp) execute(t *Task) taskRunState {
// a pending signal, causing another interruption, but that signal should
// not interact with the interrupted syscall.)
if t.haveSyscallReturn {
- if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
- if sre == ERESTART_RESTARTBLOCK {
+ if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ if sre == syserror.ERESTART_RESTARTBLOCK {
t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
t.Arch().RestartSyscallWithRestartBlock()
} else {
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 09366b60c..52c55d13d 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -133,9 +133,10 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
}
}
-// Preconditions: The caller must be running on the task goroutine, and leaving
-// a state indicated by a previous call to
-// t.accountTaskGoroutineEnter(state).
+// Preconditions:
+// * The caller must be running on the task goroutine
+// * The caller must be leaving a state indicated by a previous call to
+// t.accountTaskGoroutineEnter(state).
func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
if state != TaskGoroutineRunningApp {
// Task is unblocking/continuing.
@@ -191,8 +192,8 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats {
return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
}
-// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex
-// must be locked.
+// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
+// * The TaskSet mutex must be locked.
func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
stats := tg.exitedCPUStats
// Account for live tasks.
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index cff2a8365..feaa38596 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -159,7 +159,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
sigact := computeAction(linux.Signal(info.Signo), act)
if t.haveSyscallReturn {
- if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
// Signals that are ignored, cause a thread group stop, or
// terminate the thread group do not interact with interrupted
// syscalls; in Linux terms, they are never returned to the signal
@@ -168,11 +168,11 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
// signal that is actually handled (by userspace).
if sigact == SignalActionHandler {
switch {
- case sre == ERESTARTNOHAND:
+ case sre == syserror.ERESTARTNOHAND:
fallthrough
- case sre == ERESTART_RESTARTBLOCK:
+ case sre == syserror.ERESTART_RESTARTBLOCK:
fallthrough
- case (sre == ERESTARTSYS && !act.IsRestart()):
+ case (sre == syserror.ERESTARTSYS && !act.IsRestart()):
t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1)))
default:
@@ -319,8 +319,9 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
// Sigtimedwait implements the semantics of sigtimedwait(2).
//
-// Preconditions: The caller must be running on the task goroutine. t.exitState
-// < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
// set is the set of signals we're interested in; invert it to get the set
// of signals to block.
@@ -584,8 +585,9 @@ func (t *Task) SignalMask() linux.SignalSet {
// SetSignalMask sets t's signal mask.
//
-// Preconditions: SetSignalMask can only be called by the task goroutine.
-// t.exitState < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
func (t *Task) SetSignalMask(mask linux.SignalSet) {
// By precondition, t prevents t.tg from completing an execve and mutating
// t.tg.signalHandlers, so we can skip the TaskSet mutex.
@@ -631,7 +633,7 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's
// comment).
//
-// Preconditions: SetSavedSignalMask can only be called by the task goroutine.
+// Preconditions: The caller must be running on the task goroutine.
func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
t.savedSignalMask = mask
t.haveSavedSignalMask = true
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
index 296735d32..a35948a5f 100644
--- a/pkg/sentry/kernel/task_stop.go
+++ b/pkg/sentry/kernel/task_stop.go
@@ -99,8 +99,9 @@ type TaskStop interface {
// beginInternalStop indicates the start of an internal stop that applies to t.
//
-// Preconditions: The task must not already be in an internal stop (i.e. t.stop
-// == nil). The caller must be running on the task goroutine.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * The task must not already be in an internal stop (i.e. t.stop == nil).
func (t *Task) beginInternalStop(s TaskStop) {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
@@ -109,8 +110,8 @@ func (t *Task) beginInternalStop(s TaskStop) {
t.beginInternalStopLocked(s)
}
-// Preconditions: The signal mutex must be locked. All preconditions for
-// Task.beginInternalStop also apply.
+// Preconditions: Same as beginInternalStop, plus:
+// * The signal mutex must be locked.
func (t *Task) beginInternalStopLocked(s TaskStop) {
if t.stop != nil {
panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop))
@@ -128,8 +129,9 @@ func (t *Task) beginInternalStopLocked(s TaskStop) {
// t.stop, which is why there is no endInternalStop that locks the signal mutex
// for you.
//
-// Preconditions: The signal mutex must be locked. The task must be in an
-// internal stop (i.e. t.stop != nil).
+// Preconditions:
+// * The signal mutex must be locked.
+// * The task must be in an internal stop (i.e. t.stop != nil).
func (t *Task) endInternalStopLocked() {
if t.stop == nil {
panic("Attempting to leave non-existent internal stop")
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index a5903b0b5..0141459e7 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -29,75 +30,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel
-// include/linux/errno.h. These errnos are never returned to userspace
-// directly, but are used to communicate the expected behavior of an
-// interrupted syscall from the syscall to signal handling.
-type SyscallRestartErrno int
-
-// These numeric values are significant because ptrace syscall exit tracing can
-// observe them.
-//
-// For all of the following errnos, if the syscall is not interrupted by a
-// signal delivered to a user handler, the syscall is restarted.
-const (
- // ERESTARTSYS is returned by an interrupted syscall to indicate that it
- // should be converted to EINTR if interrupted by a signal delivered to a
- // user handler without SA_RESTART set, and restarted otherwise.
- ERESTARTSYS = SyscallRestartErrno(512)
-
- // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it
- // should always be restarted.
- ERESTARTNOINTR = SyscallRestartErrno(513)
-
- // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it
- // should be converted to EINTR if interrupted by a signal delivered to a
- // user handler, and restarted otherwise.
- ERESTARTNOHAND = SyscallRestartErrno(514)
-
- // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate
- // that it should be restarted using a custom function. The interrupted
- // syscall must register a custom restart function by calling
- // Task.SetRestartSyscallFn.
- ERESTART_RESTARTBLOCK = SyscallRestartErrno(516)
-)
-
var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application")
-// Error implements error.Error.
-func (e SyscallRestartErrno) Error() string {
- // Descriptions are borrowed from strace.
- switch e {
- case ERESTARTSYS:
- return "to be restarted if SA_RESTART is set"
- case ERESTARTNOINTR:
- return "to be restarted"
- case ERESTARTNOHAND:
- return "to be restarted if no handler"
- case ERESTART_RESTARTBLOCK:
- return "interrupted by signal"
- default:
- return "(unknown interrupt error)"
- }
-}
-
-// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
-// rv, the value in a syscall return register.
-func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
- switch int(rv) {
- case -int(ERESTARTSYS):
- return ERESTARTSYS, true
- case -int(ERESTARTNOINTR):
- return ERESTARTNOINTR, true
- case -int(ERESTARTNOHAND):
- return ERESTARTNOHAND, true
- case -int(ERESTART_RESTARTBLOCK):
- return ERESTART_RESTARTBLOCK, true
- default:
- return 0, false
- }
-}
-
// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
@@ -354,7 +288,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
// Grab the caller up front, to make sure there's a sensible stack.
caller := t.Arch().Native(uintptr(0))
- if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil {
+ if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil {
t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
@@ -390,7 +324,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
type runVsyscallAfterPtraceEventSeccomp struct {
addr usermem.Addr
sysno uintptr
- caller interface{}
+ caller marshal.Marshallable
}
func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
@@ -413,7 +347,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}
-func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
+func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
rval, ctrl, err := t.executeSyscall(sysno, args)
if ctrl != nil {
t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
@@ -447,7 +381,7 @@ func ExtractErrno(err error, sysno int) int {
return 0
case syscall.Errno:
return int(err)
- case SyscallRestartErrno:
+ case syserror.SyscallRestartErrno:
return int(err)
case *memmap.BusError:
// Bus errors may generate SIGBUS, but for syscalls they still
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index b02044ad2..ce134bf54 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,6 +18,7 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -43,17 +44,6 @@ func (t *Task) Deactivate() {
}
}
-// CopyIn copies a fixed-size value or slice of fixed-size values in from the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not readable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) {
- return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-}
-
// CopyInBytes is a fast version of CopyIn if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -64,17 +54,6 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
})
}
-// CopyOut copies a fixed-size value or slice of fixed-size values out to the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not writeable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) {
- return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-}
-
// CopyOutBytes is a fast version of CopyOut if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -114,7 +93,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
var v []string
for {
argAddr := t.Arch().Native(0)
- if _, err := t.CopyIn(addr, argAddr); err != nil {
+ if _, err := argAddr.CopyIn(t, addr); err != nil {
return v, err
}
if t.Arch().Value(argAddr) == 0 {
@@ -143,8 +122,9 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
// CopyOutIovecs converts src to an array of struct iovecs and copies it to the
// memory mapped at addr.
//
-// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyOut, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
switch t.Arch().Width() {
case 8:
@@ -191,8 +171,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
// combined length of all AddrRanges would otherwise exceed this amount, ranges
// beyond MAX_RW_COUNT are silently truncated.
//
-// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyIn, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
if numIovecs == 0 {
return usermem.AddrRangeSeq{}, nil
@@ -284,7 +265,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp
//
// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
//
-// Preconditions: As for Task.CopyInIovecs.
+// Preconditions: Same as Task.CopyInIovecs.
func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
return usermem.IOSequence{}, syserror.EINVAL
@@ -299,3 +280,30 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
Opts: opts,
}, nil
}
+
+// copyContext implements marshal.CopyContext. It wraps a task to allow copying
+// memory to and from the task memory with custom usermem.IOOpts.
+type copyContext struct {
+ *Task
+ opts usermem.IOOpts
+}
+
+// AsCopyContext wraps the task and returns it as CopyContext.
+func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
+ return &copyContext{t, opts}
+}
+
+// CopyInString copies a string in from the task's memory.
+func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
+ return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+}
+
+// CopyInBytes copies task memory into dst from an IO context.
+func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+ return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+}
+
+// CopyOutBytes copies src into task memoryfrom an IO context.
+func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+ return t.MemoryManager().CopyOut(t, addr, src, t.opts)
+}
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index 872e1a82d..5ae5906e8 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -36,6 +36,8 @@ import (
const TasksLimit = (1 << 16)
// ThreadID is a generic thread identifier.
+//
+// +marshal
type ThreadID int32
// String returns a decimal representation of the ThreadID.
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index e959700f2..f61a8e164 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -616,8 +616,10 @@ func (t *Timer) Swap(s Setting) (Time, Setting) {
// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true
// starts the timer, while setting s.Enabled to false stops it.
//
-// Preconditions: The Timer must not be paused. f cannot call any Timer methods
-// since it is called with the Timer mutex locked.
+// Preconditions:
+// * The Timer must not be paused.
+// * f cannot call any Timer methods since it is called with the Timer mutex
+// locked.
func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
now := t.clock.Now()
t.mu.Lock()
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index 290c32466..e44a139b3 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -73,13 +73,10 @@ type VDSOParamPage struct {
// NewVDSOParamPage returns a VDSOParamPage.
//
// Preconditions:
-//
// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does
// not take ownership of fr; it must remain allocated for the lifetime of the
// VDSOParamPage.
-//
// * VDSOParamPage must be the only writer to fr.
-//
// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
return &VDSOParamPage{mfp: mfp, fr: fr}