summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--pkg/sentry/kernel/BUILD62
-rw-r--r--pkg/sentry/kernel/abstract_socket_namespace.go82
-rw-r--r--pkg/sentry/kernel/auth/BUILD1
-rw-r--r--pkg/sentry/kernel/auth/context.go20
-rw-r--r--pkg/sentry/kernel/auth/id.go4
-rw-r--r--pkg/sentry/kernel/epoll/epoll.go14
-rw-r--r--pkg/sentry/kernel/epoll/epoll_test.go5
-rw-r--r--pkg/sentry/kernel/eventfd/eventfd.go4
-rw-r--r--pkg/sentry/kernel/fd_table.go218
-rw-r--r--pkg/sentry/kernel/fd_table_test.go14
-rw-r--r--pkg/sentry/kernel/fd_table_unsafe.go63
-rw-r--r--pkg/sentry/kernel/fs_context.go108
-rw-r--r--pkg/sentry/kernel/futex/BUILD1
-rw-r--r--pkg/sentry/kernel/futex/futex.go43
-rw-r--r--pkg/sentry/kernel/futex/futex_test.go66
-rw-r--r--pkg/sentry/kernel/kcov.go335
-rw-r--r--pkg/sentry/kernel/kcov_unsafe.go28
-rw-r--r--pkg/sentry/kernel/kernel.go186
-rw-r--r--pkg/sentry/kernel/pipe/BUILD1
-rw-r--r--pkg/sentry/kernel/pipe/node.go6
-rw-r--r--pkg/sentry/kernel/pipe/node_test.go2
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go19
-rw-r--r--pkg/sentry/kernel/pipe/pipe_test.go16
-rw-r--r--pkg/sentry/kernel/pipe/pipe_util.go14
-rw-r--r--pkg/sentry/kernel/pipe/reader.go3
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go76
-rw-r--r--pkg/sentry/kernel/pipe/writer.go3
-rw-r--r--pkg/sentry/kernel/ptrace.go58
-rw-r--r--pkg/sentry/kernel/ptrace_amd64.go2
-rw-r--r--pkg/sentry/kernel/rseq.go31
-rw-r--r--pkg/sentry/kernel/seccomp.go46
-rw-r--r--pkg/sentry/kernel/sessions.go28
-rw-r--r--pkg/sentry/kernel/shm/BUILD14
-rw-r--r--pkg/sentry/kernel/shm/shm.go28
-rw-r--r--pkg/sentry/kernel/signalfd/BUILD1
-rw-r--r--pkg/sentry/kernel/signalfd/signalfd.go16
-rw-r--r--pkg/sentry/kernel/syscalls.go10
-rw-r--r--pkg/sentry/kernel/syslog.go9
-rw-r--r--pkg/sentry/kernel/task.go45
-rw-r--r--pkg/sentry/kernel/task_clone.go20
-rw-r--r--pkg/sentry/kernel/task_context.go6
-rw-r--r--pkg/sentry/kernel/task_exec.go17
-rw-r--r--pkg/sentry/kernel/task_exit.go16
-rw-r--r--pkg/sentry/kernel/task_futex.go126
-rw-r--r--pkg/sentry/kernel/task_log.go45
-rw-r--r--pkg/sentry/kernel/task_run.go26
-rw-r--r--pkg/sentry/kernel/task_sched.go11
-rw-r--r--pkg/sentry/kernel/task_signals.go34
-rw-r--r--pkg/sentry/kernel/task_start.go6
-rw-r--r--pkg/sentry/kernel/task_stop.go30
-rw-r--r--pkg/sentry/kernel/task_syscall.go76
-rw-r--r--pkg/sentry/kernel/task_usermem.go64
-rw-r--r--pkg/sentry/kernel/task_work.go38
-rw-r--r--pkg/sentry/kernel/thread_group.go4
-rw-r--r--pkg/sentry/kernel/threads.go2
-rw-r--r--pkg/sentry/kernel/time/BUILD1
-rw-r--r--pkg/sentry/kernel/time/tcpip.go131
-rw-r--r--pkg/sentry/kernel/time/time.go6
-rw-r--r--pkg/sentry/kernel/timekeeper.go7
-rw-r--r--pkg/sentry/kernel/vdso.go28
60 files changed, 1720 insertions, 656 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 25fe1921b..5de70aecb 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -69,8 +69,52 @@ go_template_instance(
prefix = "socket",
template = "//pkg/ilist:generic_list",
types = {
- "Element": "*SocketEntry",
- "Linker": "*SocketEntry",
+ "Element": "*SocketRecordVFS1",
+ "Linker": "*SocketRecordVFS1",
+ },
+)
+
+go_template_instance(
+ name = "fd_table_refs",
+ out = "fd_table_refs.go",
+ package = "kernel",
+ prefix = "FDTable",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "FDTable",
+ },
+)
+
+go_template_instance(
+ name = "fs_context_refs",
+ out = "fs_context_refs.go",
+ package = "kernel",
+ prefix = "FSContext",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "FSContext",
+ },
+)
+
+go_template_instance(
+ name = "process_group_refs",
+ out = "process_group_refs.go",
+ package = "kernel",
+ prefix = "ProcessGroup",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "ProcessGroup",
+ },
+)
+
+go_template_instance(
+ name = "session_refs",
+ out = "session_refs.go",
+ package = "kernel",
+ prefix = "Session",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "Session",
},
)
@@ -88,9 +132,13 @@ go_library(
"aio.go",
"context.go",
"fd_table.go",
+ "fd_table_refs.go",
"fd_table_unsafe.go",
"fs_context.go",
+ "fs_context_refs.go",
"ipc_namespace.go",
+ "kcov.go",
+ "kcov_unsafe.go",
"kernel.go",
"kernel_opts.go",
"kernel_state.go",
@@ -99,6 +147,7 @@ go_library(
"pending_signals_state.go",
"posixtimer.go",
"process_group_list.go",
+ "process_group_refs.go",
"ptrace.go",
"ptrace_amd64.go",
"ptrace_arm64.go",
@@ -106,6 +155,7 @@ go_library(
"seccomp.go",
"seqatomic_taskgoroutineschedinfo_unsafe.go",
"session_list.go",
+ "session_refs.go",
"sessions.go",
"signal.go",
"signal_handlers.go",
@@ -132,6 +182,7 @@ go_library(
"task_stop.go",
"task_syscall.go",
"task_usermem.go",
+ "task_work.go",
"thread_group.go",
"threads.go",
"timekeeper.go",
@@ -146,22 +197,26 @@ go_library(
"gvisor.dev/gvisor/pkg/sentry/device",
"gvisor.dev/gvisor/pkg/tcpip",
],
+ marshal = True,
visibility = ["//:sandbox"],
deps = [
":uncaught_signal_go_proto",
"//pkg/abi",
"//pkg/abi/linux",
"//pkg/amutex",
- "//pkg/binary",
"//pkg/bits",
"//pkg/bpf",
"//pkg/context",
+ "//pkg/coverage",
"//pkg/cpuid",
"//pkg/eventchannel",
"//pkg/fspath",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/refs",
+ "//pkg/refs_vfs2",
"//pkg/safemem",
"//pkg/secio",
"//pkg/sentry/arch",
@@ -208,7 +263,6 @@ go_library(
"//pkg/tcpip/stack",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index 920fe4329..1b9721534 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -15,28 +15,21 @@
package kernel
import (
+ "fmt"
"syscall"
- "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/refs_vfs2"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sync"
)
// +stateify savable
type abstractEndpoint struct {
- ep transport.BoundEndpoint
- wr *refs.WeakRef
- name string
- ns *AbstractSocketNamespace
-}
-
-// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (e *abstractEndpoint) WeakRefGone() {
- e.ns.mu.Lock()
- if e.ns.endpoints[e.name].ep == e.ep {
- delete(e.ns.endpoints, e.name)
- }
- e.ns.mu.Unlock()
+ ep transport.BoundEndpoint
+ socket refs_vfs2.RefCounter
+ name string
+ ns *AbstractSocketNamespace
}
// AbstractSocketNamespace is used to implement the Linux abstract socket functionality.
@@ -45,7 +38,11 @@ func (e *abstractEndpoint) WeakRefGone() {
type AbstractSocketNamespace struct {
mu sync.Mutex `state:"nosave"`
- // Keeps mapping from name to endpoint.
+ // Keeps a mapping from name to endpoint. AbstractSocketNamespace does not hold
+ // any references on any sockets that it contains; when retrieving a socket,
+ // TryIncRef() must be called in case the socket is concurrently being
+ // destroyed. It is the responsibility of the socket to remove itself from the
+ // abstract socket namespace when it is destroyed.
endpoints map[string]abstractEndpoint
}
@@ -57,16 +54,16 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace {
}
// A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on
-// its backing object.
+// its backing socket.
type boundEndpoint struct {
transport.BoundEndpoint
- rc refs.RefCounter
+ socket refs_vfs2.RefCounter
}
// Release implements transport.BoundEndpoint.Release.
-func (e *boundEndpoint) Release() {
- e.rc.DecRef()
- e.BoundEndpoint.Release()
+func (e *boundEndpoint) Release(ctx context.Context) {
+ e.socket.DecRef(ctx)
+ e.BoundEndpoint.Release(ctx)
}
// BoundEndpoint retrieves the endpoint bound to the given name. The return
@@ -80,32 +77,59 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp
return nil
}
- rc := ep.wr.Get()
- if rc == nil {
- delete(a.endpoints, name)
+ if !ep.socket.TryIncRef() {
+ // The socket has reached zero references and is being destroyed.
return nil
}
- return &boundEndpoint{ep.ep, rc}
+ return &boundEndpoint{ep.ep, ep.socket}
}
// Bind binds the given socket.
//
-// When the last reference managed by rc is dropped, ep may be removed from the
+// When the last reference managed by socket is dropped, ep may be removed from the
// namespace.
-func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error {
+func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refs_vfs2.RefCounter) error {
a.mu.Lock()
defer a.mu.Unlock()
+ // Check if there is already a socket (which has not yet been destroyed) bound at name.
if ep, ok := a.endpoints[name]; ok {
- if rc := ep.wr.Get(); rc != nil {
- rc.DecRef()
+ if ep.socket.TryIncRef() {
+ ep.socket.DecRef(ctx)
return syscall.EADDRINUSE
}
}
ae := abstractEndpoint{ep: ep, name: name, ns: a}
- ae.wr = refs.NewWeakRef(rc, &ae)
+ ae.socket = socket
a.endpoints[name] = ae
return nil
}
+
+// Remove removes the specified socket at name from the abstract socket
+// namespace, if it has not yet been replaced.
+func (a *AbstractSocketNamespace) Remove(name string, socket refs_vfs2.RefCounter) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ ep, ok := a.endpoints[name]
+ if !ok {
+ // We never delete a map entry apart from a socket's destructor (although the
+ // map entry may be overwritten). Therefore, a socket should exist, even if it
+ // may not be the one we expect.
+ panic(fmt.Sprintf("expected socket to exist at '%s' in abstract socket namespace", name))
+ }
+
+ // A Bind() operation may race with callers of Remove(), e.g. in the
+ // following case:
+ // socket1 reaches zero references and begins destruction
+ // a.Bind("foo", ep, socket2) replaces socket1 with socket2
+ // socket1's destructor calls a.Remove("foo", socket1)
+ //
+ // Therefore, we need to check that the socket at name is what we expect
+ // before modifying the map.
+ if ep.socket == socket {
+ delete(a.endpoints, name)
+ }
+}
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 2bc49483a..869e49ebc 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -57,6 +57,7 @@ go_library(
"id_map_set.go",
"user_namespace.go",
],
+ marshal = True,
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go
index ef5723127..c08d47787 100644
--- a/pkg/sentry/kernel/auth/context.go
+++ b/pkg/sentry/kernel/auth/context.go
@@ -34,3 +34,23 @@ func CredentialsFromContext(ctx context.Context) *Credentials {
}
return NewAnonymousCredentials()
}
+
+// ContextWithCredentials returns a copy of ctx carrying creds.
+func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context {
+ return &authContext{ctx, creds}
+}
+
+type authContext struct {
+ context.Context
+ creds *Credentials
+}
+
+// Value implements context.Context.
+func (ac *authContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxCredentials:
+ return ac.creds
+ default:
+ return ac.Context.Value(key)
+ }
+}
diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go
index 0a58ba17c..4c32ee703 100644
--- a/pkg/sentry/kernel/auth/id.go
+++ b/pkg/sentry/kernel/auth/id.go
@@ -19,9 +19,13 @@ import (
)
// UID is a user ID in an unspecified user namespace.
+//
+// +marshal
type UID uint32
// GID is a group ID in an unspecified user namespace.
+//
+// +marshal slice:GIDSlice
type GID uint32
// In the root user namespace, user/group IDs have a 1-to-1 relationship with
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 4c0f1e41f..15519f0df 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -76,8 +76,8 @@ type pollEntry struct {
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
// weakReferenceGone is called when the file in the weak reference is destroyed.
// The poll entry is removed in response to this.
-func (p *pollEntry) WeakRefGone() {
- p.epoll.RemoveEntry(p.id)
+func (p *pollEntry) WeakRefGone(ctx context.Context) {
+ p.epoll.RemoveEntry(ctx, p.id)
}
// EventPoll holds all the state associated with an event poll object, that is,
@@ -144,14 +144,14 @@ func NewEventPoll(ctx context.Context) *fs.File {
// name matches fs/eventpoll.c:epoll_create1.
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
// Release the initial dirent reference after NewFile takes a reference.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{
files: make(map[FileIdentifier]*pollEntry),
})
}
// Release implements fs.FileOperations.Release.
-func (e *EventPoll) Release() {
+func (e *EventPoll) Release(ctx context.Context) {
// We need to take the lock now because files may be attempting to
// remove entries in parallel if they get destroyed.
e.mu.Lock()
@@ -160,7 +160,7 @@ func (e *EventPoll) Release() {
// Go through all entries and clean up.
for _, entry := range e.files {
entry.id.File.EventUnregister(&entry.waiter)
- entry.file.Drop()
+ entry.file.Drop(ctx)
}
e.files = nil
}
@@ -423,7 +423,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter
}
// RemoveEntry a files from the collection of observed files.
-func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
+func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error {
e.mu.Lock()
defer e.mu.Unlock()
@@ -445,7 +445,7 @@ func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
// Remove file from map, and drop weak reference.
delete(e.files, id)
- entry.file.Drop()
+ entry.file.Drop(ctx)
return nil
}
diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go
index 22630e9c5..55b505593 100644
--- a/pkg/sentry/kernel/epoll/epoll_test.go
+++ b/pkg/sentry/kernel/epoll/epoll_test.go
@@ -26,7 +26,8 @@ func TestFileDestroyed(t *testing.T) {
f := filetest.NewTestFile(t)
id := FileIdentifier{f, 12}
- efile := NewEventPoll(contexttest.Context(t))
+ ctx := contexttest.Context(t)
+ efile := NewEventPoll(ctx)
e := efile.FileOperations.(*EventPoll)
if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil {
t.Fatalf("addEntry failed: %v", err)
@@ -44,7 +45,7 @@ func TestFileDestroyed(t *testing.T) {
}
// Destroy the file. Check that we get no more events.
- f.DecRef()
+ f.DecRef(ctx)
evt = e.ReadEvents(1)
if len(evt) != 0 {
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
index 87951adeb..bbf568dfc 100644
--- a/pkg/sentry/kernel/eventfd/eventfd.go
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -70,7 +70,7 @@ func New(ctx context.Context, initVal uint64, semMode bool) *fs.File {
// name matches fs/eventfd.c:eventfd_file_create.
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]")
// Release the initial dirent reference after NewFile takes a reference.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{
val: initVal,
semMode: semMode,
@@ -106,7 +106,7 @@ func (e *EventOperations) HostFD() (int, error) {
}
// Release implements fs.FileOperations.Release.
-func (e *EventOperations) Release() {
+func (e *EventOperations) Release(context.Context) {
e.mu.Lock()
defer e.mu.Unlock()
if e.hostfd >= 0 {
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 4b7d234a4..0ec7344cd 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -78,7 +77,8 @@ type descriptor struct {
//
// +stateify savable
type FDTable struct {
- refs.AtomicRefCount
+ FDTableRefs
+
k *Kernel
// mu protects below.
@@ -98,7 +98,7 @@ type FDTable struct {
func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
m := make(map[int32]descriptor)
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
m[fd] = descriptor{
file: file,
fileVFS2: fileVFS2,
@@ -109,24 +109,28 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
}
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
+ ctx := context.Background()
f.init() // Initialize table.
+ f.used = 0
for fd, d := range m {
- f.setAll(fd, d.file, d.fileVFS2, d.flags)
+ if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil {
+ panic("VFS1 or VFS2 files set")
+ }
// Note that we do _not_ need to acquire a extra table reference here. The
// table reference will already be accounted for in the file, so we drop the
// reference taken by set above.
switch {
case d.file != nil:
- d.file.DecRef()
+ d.file.DecRef(ctx)
case d.fileVFS2 != nil:
- d.fileVFS2.DecRef()
+ d.fileVFS2.DecRef(ctx)
}
}
}
// drop drops the table reference.
-func (f *FDTable) drop(file *fs.File) {
+func (f *FDTable) drop(ctx context.Context, file *fs.File) {
// Release locks.
file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
@@ -144,14 +148,14 @@ func (f *FDTable) drop(file *fs.File) {
d.InotifyEvent(ev, 0)
// Drop the table reference.
- file.DecRef()
+ file.DecRef(ctx)
}
// dropVFS2 drops the table reference.
-func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
+func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
// Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
// entire file.
- err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET)
+ err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET)
if err != nil && err != syserror.ENOLCK {
panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
}
@@ -161,10 +165,10 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
if file.IsWritable() {
ev = linux.IN_CLOSE_WRITE
}
- file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent)
// Drop the table's reference.
- file.DecRef()
+ file.DecRef(ctx)
}
// NewFDTable allocates a new FDTable that may be used by tasks in k.
@@ -174,28 +178,21 @@ func (k *Kernel) NewFDTable() *FDTable {
return f
}
-// destroy removes all of the file descriptors from the map.
-func (f *FDTable) destroy() {
- f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool {
- return true
+// DecRef implements RefCounter.DecRef.
+//
+// If f reaches zero references, all of its file descriptors are removed.
+func (f *FDTable) DecRef(ctx context.Context) {
+ f.FDTableRefs.DecRef(func() {
+ f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
+ return true
+ })
})
}
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
-func (f *FDTable) DecRef() {
- f.DecRefWithDestructor(f.destroy)
-}
-
-// Size returns the number of file descriptor slots currently allocated.
-func (f *FDTable) Size() int {
- size := atomic.LoadInt32(&f.used)
- return int(size)
-}
-
// forEach iterates over all non-nil files in sorted order.
//
// It is the caller's responsibility to acquire an appropriate lock.
-func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
+func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
// retries tracks the number of failed TryIncRef attempts for the same FD.
retries := 0
fd := int32(0)
@@ -214,7 +211,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
continue // Race caught.
}
fn(fd, file, nil, flags)
- file.DecRef()
+ file.DecRef(ctx)
case fileVFS2 != nil:
if !fileVFS2.TryIncRef() {
retries++
@@ -224,7 +221,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
continue // Race caught.
}
fn(fd, nil, fileVFS2, flags)
- fileVFS2.DecRef()
+ fileVFS2.DecRef(ctx)
}
retries = 0
fd++
@@ -234,7 +231,8 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
// String is a stringer for FDTable.
func (f *FDTable) String() string {
var buf strings.Builder
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ ctx := context.Background()
+ f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
switch {
case file != nil:
n, _ := file.Dirent.FullName(nil /* root */)
@@ -242,7 +240,7 @@ func (f *FDTable) String() string {
case fileVFS2 != nil:
vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
- name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
+ name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
if err != nil {
fmt.Fprintf(&buf, "<err: %v>\n", err)
return
@@ -277,7 +275,6 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
}
f.mu.Lock()
- defer f.mu.Unlock()
// From f.next to find available fd.
if fd < f.next {
@@ -287,15 +284,25 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.get(i); d == nil {
- f.set(i, files[len(fds)], flags) // Set the descriptor.
- fds = append(fds, i) // Record the file descriptor.
+ // Set the descriptor.
+ f.set(ctx, i, files[len(fds)], flags)
+ fds = append(fds, i) // Record the file descriptor.
}
}
// Failure? Unwind existing FDs.
if len(fds) < len(files) {
for _, i := range fds {
- f.set(i, nil, FDFlags{}) // Zap entry.
+ f.set(ctx, i, nil, FDFlags{})
+ }
+ f.mu.Unlock()
+
+ // Drop the reference taken by the call to f.set() that
+ // originally installed the file. Don't call f.drop()
+ // (generating inotify events, etc.) since the file should
+ // appear to have never been inserted into f.
+ for _, file := range files[:len(fds)] {
+ file.DecRef(ctx)
}
return nil, syscall.EMFILE
}
@@ -305,6 +312,7 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
f.next = fds[len(fds)-1] + 1
}
+ f.mu.Unlock()
return fds, nil
}
@@ -332,7 +340,6 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
}
f.mu.Lock()
- defer f.mu.Unlock()
// From f.next to find available fd.
if fd < f.next {
@@ -342,15 +349,25 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.getVFS2(i); d == nil {
- f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
- fds = append(fds, i) // Record the file descriptor.
+ // Set the descriptor.
+ f.setVFS2(ctx, i, files[len(fds)], flags)
+ fds = append(fds, i) // Record the file descriptor.
}
}
// Failure? Unwind existing FDs.
if len(fds) < len(files) {
for _, i := range fds {
- f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+ f.setVFS2(ctx, i, nil, FDFlags{})
+ }
+ f.mu.Unlock()
+
+ // Drop the reference taken by the call to f.setVFS2() that
+ // originally installed the file. Don't call f.dropVFS2()
+ // (generating inotify events, etc.) since the file should
+ // appear to have never been inserted into f.
+ for _, file := range files[:len(fds)] {
+ file.DecRef(ctx)
}
return nil, syscall.EMFILE
}
@@ -360,6 +377,7 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
f.next = fds[len(fds)-1] + 1
}
+ f.mu.Unlock()
return fds, nil
}
@@ -395,7 +413,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
}
for fd < end {
if d, _, _ := f.getVFS2(fd); d == nil {
- f.setVFS2(fd, file, flags)
+ f.setVFS2(ctx, fd, file, flags)
if fd == f.next {
// Update next search start position.
f.next = fd + 1
@@ -411,40 +429,55 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
- return f.newFDAt(ctx, fd, file, nil, flags)
+ df, _, err := f.newFDAt(ctx, fd, file, nil, flags)
+ if err != nil {
+ return err
+ }
+ if df != nil {
+ f.drop(ctx, df)
+ }
+ return nil
}
// NewFDAtVFS2 sets the file reference for the given FD. If there is an active
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
- return f.newFDAt(ctx, fd, nil, file, flags)
+ _, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags)
+ if err != nil {
+ return err
+ }
+ if dfVFS2 != nil {
+ f.dropVFS2(ctx, dfVFS2)
+ }
+ return nil
}
-func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
+func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) {
if fd < 0 {
// Don't accept negative FDs.
- return syscall.EBADF
+ return nil, nil, syscall.EBADF
}
// Check the limit for the provided file.
if limitSet := limits.FromContext(ctx); limitSet != nil {
if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
- return syscall.EMFILE
+ return nil, nil, syscall.EMFILE
}
}
// Install the entry.
f.mu.Lock()
defer f.mu.Unlock()
- f.setAll(fd, file, fileVFS2, flags)
- return nil
+
+ df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags)
+ return df, dfVFS2, nil
}
// SetFlags sets the flags for the given file descriptor.
//
// True is returned iff flags were changed.
-func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
@@ -460,14 +493,14 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
}
// Update the flags.
- f.set(fd, file, flags)
+ f.set(ctx, fd, file, flags)
return nil
}
// SetFlagsVFS2 sets the flags for the given file descriptor.
//
// True is returned iff flags were changed.
-func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
@@ -483,7 +516,7 @@ func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
}
// Update the flags.
- f.setVFS2(fd, file, flags)
+ f.setVFS2(ctx, fd, file, flags)
return nil
}
@@ -541,50 +574,23 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
//
// Precondition: The caller must be running on the task goroutine, or Task.mu
// must be locked.
-func (f *FDTable) GetFDs() []int32 {
+func (f *FDTable) GetFDs(ctx context.Context) []int32 {
fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
- f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
fds = append(fds, fd)
})
return fds
}
-// GetRefs returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefs() []*fs.File {
- files := make([]*fs.File, 0, f.Size())
- f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- file.IncRef() // Acquire a reference for caller.
- files = append(files, file)
- })
- return files
-}
-
-// GetRefsVFS2 returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription {
- files := make([]*vfs.FileDescription, 0, f.Size())
- f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
- file.IncRef() // Acquire a reference for caller.
- files = append(files, file)
- })
- return files
-}
-
// Fork returns an independent FDTable.
-func (f *FDTable) Fork() *FDTable {
+func (f *FDTable) Fork(ctx context.Context) *FDTable {
clone := f.k.NewFDTable()
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
// The set function here will acquire an appropriate table
// reference for the clone. We don't need anything else.
- switch {
- case file != nil:
- clone.set(fd, file, flags)
- case fileVFS2 != nil:
- clone.setVFS2(fd, fileVFS2, flags)
+ if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil {
+ panic("VFS1 or VFS2 files set")
}
})
return clone
@@ -593,13 +599,12 @@ func (f *FDTable) Fork() *FDTable {
// Remove removes an FD from and returns a non-file iff successful.
//
// N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
+func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) {
if fd < 0 {
return nil, nil
}
f.mu.Lock()
- defer f.mu.Unlock()
// Update current available position.
if fd < f.next {
@@ -615,24 +620,51 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
case orig2 != nil:
orig2.IncRef()
}
+
if orig != nil || orig2 != nil {
- f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+ orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry.
+ }
+ f.mu.Unlock()
+
+ if orig != nil {
+ f.drop(ctx, orig)
}
+ if orig2 != nil {
+ f.dropVFS2(ctx, orig2)
+ }
+
return orig, orig2
}
// RemoveIf removes all FDs where cond is true.
-func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
- f.mu.Lock()
- defer f.mu.Unlock()
+func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
+ // TODO(gvisor.dev/issue/1624): Remove fs.File slice.
+ var files []*fs.File
+ var filesVFS2 []*vfs.FileDescription
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ f.mu.Lock()
+ f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
if cond(file, fileVFS2, flags) {
- f.set(fd, nil, FDFlags{}) // Clear from table.
+ df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table.
+ if df != nil {
+ files = append(files, df)
+ }
+ if dfVFS2 != nil {
+ filesVFS2 = append(filesVFS2, dfVFS2)
+ }
// Update current available position.
if fd < f.next {
f.next = fd
}
}
})
+ f.mu.Unlock()
+
+ for _, file := range files {
+ f.drop(ctx, file)
+ }
+
+ for _, file := range filesVFS2 {
+ f.dropVFS2(ctx, file)
+ }
}
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index 29f95a2c4..bf5460083 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -72,7 +72,7 @@ func TestFDTableMany(t *testing.T) {
}
i := int32(2)
- fdTable.Remove(i)
+ fdTable.Remove(ctx, i)
if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i {
t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err)
}
@@ -93,7 +93,7 @@ func TestFDTableOverLimit(t *testing.T) {
t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err)
} else {
for _, fd := range fds {
- fdTable.Remove(fd)
+ fdTable.Remove(ctx, fd)
}
}
@@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) {
t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
}
- ref, _ := fdTable.Remove(1)
+ ref, _ := fdTable.Remove(ctx, 1)
if ref == nil {
t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
}
- ref.DecRef()
+ ref.DecRef(ctx)
- if ref, _ := fdTable.Remove(1); ref != nil {
+ if ref, _ := fdTable.Remove(ctx, 1); ref != nil {
t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
}
})
@@ -191,7 +191,7 @@ func BenchmarkFDLookupAndDecRef(b *testing.B) {
b.StartTimer() // Benchmark.
for i := 0; i < b.N; i++ {
tf, _ := fdTable.Get(fds[i%len(fds)])
- tf.DecRef()
+ tf.DecRef(ctx)
}
})
}
@@ -219,7 +219,7 @@ func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) {
defer wg.Done()
for i := 0; i < each; i++ {
tf, _ := fdTable.Get(fds[i%len(fds)])
- tf.DecRef()
+ tf.DecRef(ctx)
}
}()
}
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 7fd97dc53..da79e6627 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"unsafe"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
@@ -31,6 +32,8 @@ type descriptorTable struct {
}
// init initializes the table.
+//
+// TODO(gvisor.dev/1486): Enable leak check for FDTable.
func (f *FDTable) init() {
var slice []unsafe.Pointer // Empty slice.
atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
@@ -76,33 +79,37 @@ func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, boo
return d.file, d.fileVFS2, d.flags, true
}
-// set sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// CurrentMaxFDs returns the number of file descriptors that may be stored in f
+// without reallocation.
+func (f *FDTable) CurrentMaxFDs() int {
+ slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+ return len(slice)
+}
+
+// set sets an entry for VFS1, refer to setAll().
//
// Precondition: mu must be held.
-func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
- f.setAll(fd, file, nil, flags)
+func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) *fs.File {
+ dropFile, _ := f.setAll(ctx, fd, file, nil, flags)
+ return dropFile
}
-// setVFS2 sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setVFS2 sets an entry for VFS2, refer to setAll().
//
// Precondition: mu must be held.
-func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
- f.setAll(fd, nil, file, flags)
+func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription {
+ _, dropFile := f.setAll(ctx, fd, nil, file, flags)
+ return dropFile
}
-// setAll sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setAll sets the file description referred to by fd to file/fileVFS2. If
+// file/fileVFS2 are non-nil, it takes a reference on them. If setAll replaces
+// an existing file description, it returns it with the FDTable's reference
+// transferred to the caller, which must call f.drop/dropVFS2() on the returned
+// file after unlocking f.mu.
//
// Precondition: mu must be held.
-func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription) {
if file != nil && fileVFS2 != nil {
panic("VFS1 and VFS2 files set")
}
@@ -145,25 +152,25 @@ func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription,
}
}
- // Drop the table reference.
+ // Adjust used.
+ switch {
+ case orig == nil && desc != nil:
+ atomic.AddInt32(&f.used, 1)
+ case orig != nil && desc == nil:
+ atomic.AddInt32(&f.used, -1)
+ }
+
if orig != nil {
switch {
case orig.file != nil:
if desc == nil || desc.file != orig.file {
- f.drop(orig.file)
+ return orig.file, nil
}
case orig.fileVFS2 != nil:
if desc == nil || desc.fileVFS2 != orig.fileVFS2 {
- f.dropVFS2(orig.fileVFS2)
+ return nil, orig.fileVFS2
}
}
}
-
- // Adjust used.
- switch {
- case orig == nil && desc != nil:
- atomic.AddInt32(&f.used, 1)
- case orig != nil && desc == nil:
- atomic.AddInt32(&f.used, -1)
- }
+ return nil, nil
}
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 47f78df9a..d46d1e1c1 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -17,7 +17,7 @@ package kernel
import (
"fmt"
- "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -29,7 +29,7 @@ import (
//
// +stateify savable
type FSContext struct {
- refs.AtomicRefCount
+ FSContextRefs
// mu protects below.
mu sync.Mutex `state:"nosave"`
@@ -63,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
cwd: cwd,
umask: umask,
}
- f.EnableLeakCheck("kernel.FSContext")
+ f.EnableLeakCheck()
return &f
}
@@ -76,54 +76,56 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
cwdVFS2: cwd,
umask: umask,
}
- f.EnableLeakCheck("kernel.FSContext")
+ f.EnableLeakCheck()
return &f
}
-// destroy is the destructor for an FSContext.
+// DecRef implements RefCounter.DecRef.
//
-// This will call DecRef on both root and cwd Dirents. If either call to
-// DecRef returns an error, then it will be propagated. If both calls to
-// DecRef return an error, then the one from root.DecRef will be propagated.
+// When f reaches zero references, DecRef will be called on both root and cwd
+// Dirents.
//
// Note that there may still be calls to WorkingDirectory() or RootDirectory()
// (that return nil). This is because valid references may still be held via
// proc files or other mechanisms.
-func (f *FSContext) destroy() {
- // Hold f.mu so that we don't race with RootDirectory() and
- // WorkingDirectory().
- f.mu.Lock()
- defer f.mu.Unlock()
-
- if VFS2Enabled {
- f.rootVFS2.DecRef()
- f.rootVFS2 = vfs.VirtualDentry{}
- f.cwdVFS2.DecRef()
- f.cwdVFS2 = vfs.VirtualDentry{}
- } else {
- f.root.DecRef()
- f.root = nil
- f.cwd.DecRef()
- f.cwd = nil
- }
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
-func (f *FSContext) DecRef() {
- f.DecRefWithDestructor(f.destroy)
+func (f *FSContext) DecRef(ctx context.Context) {
+ f.FSContextRefs.DecRef(func() {
+ // Hold f.mu so that we don't race with RootDirectory() and
+ // WorkingDirectory().
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ if VFS2Enabled {
+ f.rootVFS2.DecRef(ctx)
+ f.rootVFS2 = vfs.VirtualDentry{}
+ f.cwdVFS2.DecRef(ctx)
+ f.cwdVFS2 = vfs.VirtualDentry{}
+ } else {
+ f.root.DecRef(ctx)
+ f.root = nil
+ f.cwd.DecRef(ctx)
+ f.cwd = nil
+ }
+ })
}
// Fork forks this FSContext.
//
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
func (f *FSContext) Fork() *FSContext {
f.mu.Lock()
defer f.mu.Unlock()
if VFS2Enabled {
+ if !f.cwdVFS2.Ok() {
+ panic("FSContext.Fork() called after destroy")
+ }
f.cwdVFS2.IncRef()
f.rootVFS2.IncRef()
} else {
+ if f.cwd == nil {
+ panic("FSContext.Fork() called after destroy")
+ }
f.cwd.IncRef()
f.root.IncRef()
}
@@ -139,8 +141,8 @@ func (f *FSContext) Fork() *FSContext {
// WorkingDirectory returns the current working directory.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
func (f *FSContext) WorkingDirectory() *fs.Dirent {
f.mu.Lock()
defer f.mu.Unlock()
@@ -151,8 +153,8 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent {
// WorkingDirectoryVFS2 returns the current working directory.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
f.mu.Lock()
defer f.mu.Unlock()
@@ -164,8 +166,8 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
// SetWorkingDirectory sets the current working directory.
// This will take an extra reference on the Dirent.
//
-// This is not a valid call after destroy.
-func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
+// This is not a valid call after f is destroyed.
+func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
if d == nil {
panic("FSContext.SetWorkingDirectory called with nil dirent")
}
@@ -180,27 +182,31 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
old := f.cwd
f.cwd = d
d.IncRef()
- old.DecRef()
+ old.DecRef(ctx)
}
// SetWorkingDirectoryVFS2 sets the current working directory.
// This will take an extra reference on the VirtualDentry.
//
-// This is not a valid call after destroy.
-func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) {
+// This is not a valid call after f is destroyed.
+func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) {
f.mu.Lock()
defer f.mu.Unlock()
+ if !f.cwdVFS2.Ok() {
+ panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v)) called after destroy", d))
+ }
+
old := f.cwdVFS2
f.cwdVFS2 = d
d.IncRef()
- old.DecRef()
+ old.DecRef(ctx)
}
// RootDirectory returns the current filesystem root.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
func (f *FSContext) RootDirectory() *fs.Dirent {
f.mu.Lock()
defer f.mu.Unlock()
@@ -212,8 +218,8 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
// RootDirectoryVFS2 returns the current filesystem root.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
f.mu.Lock()
defer f.mu.Unlock()
@@ -225,8 +231,8 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
// SetRootDirectory sets the root directory.
// This will take an extra reference on the Dirent.
//
-// This is not a valid call after free.
-func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
+// This is not a valid call after f is destroyed.
+func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) {
if d == nil {
panic("FSContext.SetRootDirectory called with nil dirent")
}
@@ -241,13 +247,13 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
old := f.root
f.root = d
d.IncRef()
- old.DecRef()
+ old.DecRef(ctx)
}
// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
//
-// This is not a valid call after free.
-func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
+// This is not a valid call after f is destroyed.
+func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) {
if !vd.Ok() {
panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
}
@@ -263,7 +269,7 @@ func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
vd.IncRef()
f.rootVFS2 = vd
f.mu.Unlock()
- old.DecRef()
+ old.DecRef(ctx)
}
// Umask returns the current umask.
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index c5021f2db..daa2dae76 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -51,6 +51,7 @@ go_test(
srcs = ["futex_test.go"],
library = ":futex",
deps = [
+ "//pkg/context",
"//pkg/sync",
"//pkg/usermem",
],
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 732e66da4..e4dcc4d40 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -19,6 +19,7 @@ package futex
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -66,9 +67,9 @@ type Key struct {
Offset uint64
}
-func (k *Key) release() {
+func (k *Key) release(t Target) {
if k.MappingIdentity != nil {
- k.MappingIdentity.DecRef()
+ k.MappingIdentity.DecRef(t)
}
k.Mappable = nil
k.MappingIdentity = nil
@@ -94,6 +95,8 @@ func (k *Key) matches(k2 *Key) bool {
// Target abstracts memory accesses and keys.
type Target interface {
+ context.Context
+
// SwapUint32 gives access to usermem.IO.SwapUint32.
SwapUint32(addr usermem.Addr, new uint32) (uint32, error)
@@ -296,7 +299,7 @@ func (b *bucket) wakeWaiterLocked(w *Waiter) {
// bucket "to".
//
// Preconditions: b and to must be locked.
-func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
+func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int {
done := 0
for w := b.waiters.Front(); done < n && w != nil; {
if !w.key.matches(key) {
@@ -308,7 +311,7 @@ func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
requeued := w
w = w.Next() // Next iteration.
b.waiters.Remove(requeued)
- requeued.key.release()
+ requeued.key.release(t)
requeued.key = nkey.clone()
to.waiters.PushBack(requeued)
requeued.bucket.Store(to)
@@ -456,7 +459,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32
r := b.wakeLocked(&k, bitmask, n)
b.mu.Unlock()
- k.release()
+ k.release(t)
return r, nil
}
@@ -465,12 +468,12 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch
if err != nil {
return 0, err
}
- defer k1.release()
+ defer k1.release(t)
k2, err := getKey(t, naddr, private)
if err != nil {
return 0, err
}
- defer k2.release()
+ defer k2.release(t)
b1, b2 := m.lockBuckets(&k1, &k2)
defer b1.mu.Unlock()
@@ -488,7 +491,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch
done := b1.wakeLocked(&k1, ^uint32(0), nwake)
// Requeue the number required.
- b1.requeueLocked(b2, &k1, &k2, nreq)
+ b1.requeueLocked(t, b2, &k1, &k2, nreq)
return done, nil
}
@@ -515,12 +518,12 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak
if err != nil {
return 0, err
}
- defer k1.release()
+ defer k1.release(t)
k2, err := getKey(t, addr2, private)
if err != nil {
return 0, err
}
- defer k2.release()
+ defer k2.release(t)
b1, b2 := m.lockBuckets(&k1, &k2)
defer b1.mu.Unlock()
@@ -571,7 +574,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo
// Perform our atomic check.
if err := check(t, addr, val); err != nil {
b.mu.Unlock()
- w.key.release()
+ w.key.release(t)
return err
}
@@ -585,7 +588,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo
// WaitComplete must be called when a Waiter previously added by WaitPrepare is
// no longer eligible to be woken.
-func (m *Manager) WaitComplete(w *Waiter) {
+func (m *Manager) WaitComplete(w *Waiter, t Target) {
// Remove w from the bucket it's in.
for {
b := w.bucket.Load()
@@ -617,7 +620,7 @@ func (m *Manager) WaitComplete(w *Waiter) {
}
// Release references held by the waiter.
- w.key.release()
+ w.key.release(t)
}
// LockPI attempts to lock the futex following the Priority-inheritance futex
@@ -648,13 +651,13 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri
success, err := m.lockPILocked(w, t, addr, tid, b, try)
if err != nil {
- w.key.release()
+ w.key.release(t)
b.mu.Unlock()
return false, err
}
if success || try {
// Release waiter if it's not going to be a wait.
- w.key.release()
+ w.key.release(t)
}
b.mu.Unlock()
return success, nil
@@ -717,10 +720,10 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3
}
}
-// UnlockPI unlock the futex following the Priority-inheritance futex
-// rules. The address provided must contain the caller's TID. If there are
-// waiters, TID of the next waiter (FIFO) is set to the given address, and the
-// waiter woken up. If there are no waiters, 0 is set to the address.
+// UnlockPI unlocks the futex following the Priority-inheritance futex rules.
+// The address provided must contain the caller's TID. If there are waiters,
+// TID of the next waiter (FIFO) is set to the given address, and the waiter
+// woken up. If there are no waiters, 0 is set to the address.
func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
k, err := getKey(t, addr, private)
if err != nil {
@@ -730,7 +733,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool
err = m.unlockPILocked(t, addr, tid, b, &k)
- k.release()
+ k.release(t)
b.mu.Unlock()
return err
}
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
index 7c5c7665b..d0128c548 100644
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -22,6 +22,7 @@ import (
"testing"
"unsafe"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -29,28 +30,33 @@ import (
// testData implements the Target interface, and allows us to
// treat the address passed for futex operations as an index in
// a byte slice for testing simplicity.
-type testData []byte
+type testData struct {
+ context.Context
+ data []byte
+}
const sizeofInt32 = 4
func newTestData(size uint) testData {
- return make([]byte, size)
+ return testData{
+ data: make([]byte, size),
+ }
}
func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) {
- val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new)
+ val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new)
return val, nil
}
func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) {
- if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) {
+ if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) {
return old, nil
}
- return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
+ return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil
}
func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) {
- return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
+ return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil
}
func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) {
@@ -83,7 +89,7 @@ func TestFutexWake(t *testing.T) {
// Start waiting for wakeup.
w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w)
+ defer m.WaitComplete(w, d)
// Perform a wakeup.
if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 {
@@ -106,7 +112,7 @@ func TestFutexWakeBitmask(t *testing.T) {
// Start waiting for wakeup.
w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff)
- defer m.WaitComplete(w)
+ defer m.WaitComplete(w, d)
// Perform a wakeup using the wrong bitmask.
if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 {
@@ -141,7 +147,7 @@ func TestFutexWakeTwo(t *testing.T) {
var ws [3]*Waiter
for i := range ws {
ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(ws[i])
+ defer m.WaitComplete(ws[i], d)
}
// Perform two wakeups.
@@ -174,9 +180,9 @@ func TestFutexWakeUnrelated(t *testing.T) {
// Start two waiters waiting for wakeup on different addresses.
w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform two wakeups on the second address.
if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 {
@@ -216,9 +222,9 @@ func TestWakeOpFirstNonEmpty(t *testing.T) {
// Add two waiters on address 0.
w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform 10 wakeups on address 0.
if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 {
@@ -244,9 +250,9 @@ func TestWakeOpSecondNonEmpty(t *testing.T) {
// Add two waiters on address sizeofInt32.
w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform 10 wakeups on address sizeofInt32 (contingent on
// d.Op(0), which should succeed).
@@ -273,9 +279,9 @@ func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) {
// Add two waiters on address sizeofInt32.
w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform 10 wakeups on address sizeofInt32 (contingent on
// d.Op(1), which should fail).
@@ -302,15 +308,15 @@ func TestWakeOpAllNonEmpty(t *testing.T) {
// Add two waiters on address 0.
w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Add two waiters on address sizeofInt32.
w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w3)
+ defer m.WaitComplete(w3, d)
w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w4)
+ defer m.WaitComplete(w4, d)
// Perform 10 wakeups on address 0 (unconditionally), and 10
// wakeups on address sizeofInt32 (contingent on d.Op(0), which
@@ -344,15 +350,15 @@ func TestWakeOpAllNonEmptyFailingOp(t *testing.T) {
// Add two waiters on address 0.
w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Add two waiters on address sizeofInt32.
w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w3)
+ defer m.WaitComplete(w3, d)
w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w4)
+ defer m.WaitComplete(w4, d)
// Perform 10 wakeups on address 0 (unconditionally), and 10
// wakeups on address sizeofInt32 (contingent on d.Op(1), which
@@ -388,7 +394,7 @@ func TestWakeOpSameAddress(t *testing.T) {
var ws [4]*Waiter
for i := range ws {
ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(ws[i])
+ defer m.WaitComplete(ws[i], d)
}
// Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup
@@ -422,7 +428,7 @@ func TestWakeOpSameAddressFailingOp(t *testing.T) {
var ws [4]*Waiter
for i := range ws {
ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(ws[i])
+ defer m.WaitComplete(ws[i], d)
}
// Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup
@@ -472,7 +478,7 @@ func (t *testMutex) Lock() {
for {
// Attempt to grab the lock.
if atomic.CompareAndSwapUint32(
- (*uint32)(unsafe.Pointer(&t.d[t.a])),
+ (*uint32)(unsafe.Pointer(&t.d.data[t.a])),
testMutexUnlocked,
testMutexLocked) {
// Lock held.
@@ -490,7 +496,7 @@ func (t *testMutex) Lock() {
panic("WaitPrepare returned unexpected error: " + err.Error())
}
<-w.C
- t.m.WaitComplete(w)
+ t.m.WaitComplete(w, t.d)
}
}
@@ -498,7 +504,7 @@ func (t *testMutex) Lock() {
// This will notify any waiters via the futex manager.
func (t *testMutex) Unlock() {
// Unlock.
- atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked)
+ atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked)
// Notify all waiters.
t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32)
diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go
new file mode 100644
index 000000000..060c056df
--- /dev/null
+++ b/pkg/sentry/kernel/kcov.go
@@ -0,0 +1,335 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/coverage"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov
+// area. On Linux, the maximum is INT_MAX / 8.
+const kcovAreaSizeMax = 10 * 1024 * 1024
+
+// Kcov provides kernel coverage data to userspace through a memory-mapped
+// region, as kcov does in Linux.
+//
+// To give the illusion that the data is always up to date, we update the shared
+// memory every time before we return to userspace.
+type Kcov struct {
+ // mfp provides application memory. It is immutable after creation.
+ mfp pgalloc.MemoryFileProvider
+
+ // mu protects all of the fields below.
+ mu sync.RWMutex
+
+ // mode is the current kcov mode.
+ mode uint8
+
+ // size is the size of the mapping through which the kernel conveys coverage
+ // information to userspace.
+ size uint64
+
+ // owningTask is the task that currently owns coverage data on the system. The
+ // interface for kcov essentially requires that coverage is only going to a
+ // single task. Note that kcov should only generate coverage data for the
+ // owning task, but we currently generate global coverage.
+ owningTask *Task
+
+ // count is a locally cached version of the first uint64 in the kcov data,
+ // which is the number of subsequent entries representing PCs.
+ //
+ // It is used with kcovInode.countBlock(), to copy in/out the first element of
+ // the actual data in an efficient manner, avoid boilerplate, and prevent
+ // accidental garbage escapes by the temporary counts.
+ count uint64
+
+ mappable *mm.SpecialMappable
+}
+
+// NewKcov creates and returns a Kcov instance.
+func (k *Kernel) NewKcov() *Kcov {
+ return &Kcov{
+ mfp: k,
+ }
+}
+
+var coveragePool = sync.Pool{
+ New: func() interface{} {
+ return make([]byte, 0)
+ },
+}
+
+// TaskWork implements TaskWorker.TaskWork.
+func (kcov *Kcov) TaskWork(t *Task) {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ if kcov.mode != linux.KCOV_MODE_TRACE_PC {
+ return
+ }
+
+ rw := &kcovReadWriter{
+ mf: kcov.mfp.MemoryFile(),
+ fr: kcov.mappable.FileRange(),
+ }
+
+ // Read in the PC count.
+ if _, err := safemem.ReadFullToBlocks(rw, kcov.countBlock()); err != nil {
+ panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err))
+ }
+
+ rw.off = 8 * (1 + kcov.count)
+ n := coverage.ConsumeCoverageData(&kcovIOWriter{rw})
+
+ // Update the pc count, based on the number of entries written. Note that if
+ // we reached the end of the kcov area, we may not have written everything in
+ // output.
+ kcov.count += uint64(n / 8)
+ rw.off = 0
+ if _, err := safemem.WriteFullFromBlocks(rw, kcov.countBlock()); err != nil {
+ panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err))
+ }
+
+ // Re-register for future work.
+ t.RegisterWork(kcov)
+}
+
+// InitTrace performs the KCOV_INIT_TRACE ioctl.
+func (kcov *Kcov) InitTrace(size uint64) error {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ if kcov.mode != linux.KCOV_MODE_DISABLED {
+ return syserror.EBUSY
+ }
+
+ // To simplify all the logic around mapping, we require that the length of the
+ // shared region is a multiple of the system page size.
+ if (8*size)&(usermem.PageSize-1) != 0 {
+ return syserror.EINVAL
+ }
+
+ // We need space for at least two uint64s to hold current position and a
+ // single PC.
+ if size < 2 || size > kcovAreaSizeMax {
+ return syserror.EINVAL
+ }
+
+ kcov.size = size
+ kcov.mode = linux.KCOV_MODE_INIT
+ return nil
+}
+
+// EnableTrace performs the KCOV_ENABLE_TRACE ioctl.
+func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error {
+ t := TaskFromContext(ctx)
+ if t == nil {
+ panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+ }
+
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ // KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call.
+ if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil {
+ return syserror.EINVAL
+ }
+
+ switch traceKind {
+ case linux.KCOV_TRACE_PC:
+ kcov.mode = linux.KCOV_MODE_TRACE_PC
+ case linux.KCOV_TRACE_CMP:
+ // We do not support KCOV_MODE_TRACE_CMP.
+ return syserror.ENOTSUP
+ default:
+ return syserror.EINVAL
+ }
+
+ if kcov.owningTask != nil && kcov.owningTask != t {
+ return syserror.EBUSY
+ }
+
+ kcov.owningTask = t
+ t.SetKcov(kcov)
+ t.RegisterWork(kcov)
+
+ // Clear existing coverage data; the task expects to read only coverage data
+ // from the time it is activated.
+ coverage.ClearCoverageData()
+ return nil
+}
+
+// DisableTrace performs the KCOV_DISABLE_TRACE ioctl.
+func (kcov *Kcov) DisableTrace(ctx context.Context) error {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ t := TaskFromContext(ctx)
+ if t == nil {
+ panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+ }
+
+ if t != kcov.owningTask {
+ return syserror.EINVAL
+ }
+ kcov.mode = linux.KCOV_MODE_INIT
+ kcov.owningTask = nil
+ kcov.mappable = nil
+ return nil
+}
+
+// Clear resets the mode and clears the owning task and memory mapping for kcov.
+// It is called when the fd corresponding to kcov is closed. Note that the mode
+// needs to be set so that the next call to kcov.TaskWork() will exit early.
+func (kcov *Kcov) Clear() {
+ kcov.mu.Lock()
+ kcov.clearLocked()
+ kcov.mu.Unlock()
+}
+
+func (kcov *Kcov) clearLocked() {
+ kcov.mode = linux.KCOV_MODE_INIT
+ kcov.owningTask = nil
+ kcov.mappable = nil
+}
+
+// OnTaskExit is called when the owning task exits. It is similar to
+// kcov.Clear(), except the memory mapping is not cleared, so that the same
+// mapping can be used in the future if kcov is enabled again by another task.
+func (kcov *Kcov) OnTaskExit() {
+ kcov.mu.Lock()
+ kcov.mode = linux.KCOV_MODE_INIT
+ kcov.owningTask = nil
+ kcov.mu.Unlock()
+}
+
+// ConfigureMMap is called by the vfs.FileDescription for this kcov instance to
+// implement vfs.FileDescription.ConfigureMMap.
+func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ if kcov.mode != linux.KCOV_MODE_INIT {
+ return syserror.EINVAL
+ }
+
+ if kcov.mappable == nil {
+ // Set up the kcov area.
+ fr, err := kcov.mfp.MemoryFile().Allocate(kcov.size*8, usage.Anonymous)
+ if err != nil {
+ return err
+ }
+
+ // Get the thread id for the mmap name.
+ t := TaskFromContext(ctx)
+ if t == nil {
+ panic("ThreadFromContext returned nil")
+ }
+ // For convenience, a special mappable is used here. Note that these mappings
+ // will look different under /proc/[pid]/maps than they do on Linux.
+ kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mfp, fr)
+ }
+ opts.Mappable = kcov.mappable
+ opts.MappingIdentity = kcov.mappable
+ return nil
+}
+
+// kcovReadWriter implements safemem.Reader and safemem.Writer.
+type kcovReadWriter struct {
+ off uint64
+ mf *pgalloc.MemoryFile
+ fr memmap.FileRange
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ if dsts.IsEmpty() {
+ return 0, nil
+ }
+
+ // Limit the read to the kcov range and check for overflow.
+ if rw.fr.Length() <= rw.off {
+ return 0, io.EOF
+ }
+ start := rw.fr.Start + rw.off
+ end := rw.fr.Start + rw.fr.Length()
+ if rend := start + dsts.NumBytes(); rend < end {
+ end = rend
+ }
+
+ // Get internal mappings.
+ bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read)
+ if err != nil {
+ return 0, err
+ }
+
+ // Copy from internal mappings.
+ n, err := safemem.CopySeq(dsts, bs)
+ rw.off += n
+ return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ if srcs.IsEmpty() {
+ return 0, nil
+ }
+
+ // Limit the write to the kcov area and check for overflow.
+ if rw.fr.Length() <= rw.off {
+ return 0, io.EOF
+ }
+ start := rw.fr.Start + rw.off
+ end := rw.fr.Start + rw.fr.Length()
+ if wend := start + srcs.NumBytes(); wend < end {
+ end = wend
+ }
+
+ // Get internal mapping.
+ bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write)
+ if err != nil {
+ return 0, err
+ }
+
+ // Copy to internal mapping.
+ n, err := safemem.CopySeq(bs, srcs)
+ rw.off += n
+ return n, err
+}
+
+// kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter.
+type kcovIOWriter struct {
+ rw *kcovReadWriter
+}
+
+// Write implements io.Writer.Write.
+func (w *kcovIOWriter) Write(p []byte) (int, error) {
+ bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p))
+ n, err := safemem.WriteFullFromBlocks(w.rw, bs)
+ return int(n), err
+}
diff --git a/pkg/sentry/kernel/kcov_unsafe.go b/pkg/sentry/kernel/kcov_unsafe.go
new file mode 100644
index 000000000..6f64022eb
--- /dev/null
+++ b/pkg/sentry/kernel/kcov_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/safemem"
+)
+
+// countBlock provides a safemem.BlockSeq for k.count.
+//
+// Like k.count, the block returned is protected by k.mu.
+func (k *Kcov) countBlock() safemem.BlockSeq {
+ return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&k.count), int(unsafe.Sizeof(k.count))))
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 2177b785a..d6c21adb7 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -81,6 +81,10 @@ import (
// easy access everywhere. To be removed once VFS2 becomes the default.
var VFS2Enabled = false
+// FUSEEnabled is set to true when FUSE is enabled. Added as a global for allow
+// easy access everywhere. To be removed once FUSE is completed.
+var FUSEEnabled = false
+
// Kernel represents an emulated Linux kernel. It must be initialized by calling
// Init() or LoadFrom().
//
@@ -216,13 +220,18 @@ type Kernel struct {
// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
- // sockets is the list of all network sockets the system. Protected by
- // extMu.
+ // sockets is the list of all network sockets in the system.
+ // Protected by extMu.
+ // TODO(gvisor.dev/issue/1624): Only used by VFS1.
sockets socketList
- // nextSocketEntry is the next entry number to use in sockets. Protected
+ // socketsVFS2 records all network sockets in the system. Protected by
+ // extMu.
+ socketsVFS2 map[*vfs.FileDescription]*SocketRecord
+
+ // nextSocketRecord is the next entry number to use in sockets. Protected
// by extMu.
- nextSocketEntry uint64
+ nextSocketRecord uint64
// deviceRegistry is used to save/restore device.SimpleDevices.
deviceRegistry struct{} `state:".(*device.Registry)"`
@@ -244,7 +253,7 @@ type Kernel struct {
// SpecialOpts contains special kernel options.
SpecialOpts
- // VFS keeps the filesystem state used across the kernel.
+ // vfs keeps the filesystem state used across the kernel.
vfs vfs.VirtualFilesystem
// hostMount is the Mount used for file descriptors that were imported
@@ -372,7 +381,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.netlinkPorts = port.New()
if VFS2Enabled {
- if err := k.vfs.Init(); err != nil {
+ ctx := k.SupervisorContext()
+ if err := k.vfs.Init(ctx); err != nil {
return fmt.Errorf("failed to initialize VFS: %v", err)
}
@@ -380,19 +390,19 @@ func (k *Kernel) Init(args InitKernelArgs) error {
if err != nil {
return fmt.Errorf("failed to create pipefs filesystem: %v", err)
}
- defer pipeFilesystem.DecRef()
+ defer pipeFilesystem.DecRef(ctx)
pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create pipefs mount: %v", err)
}
k.pipeMount = pipeMount
- tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
+ tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
if err != nil {
return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
}
- defer tmpfsFilesystem.DecRef()
- defer tmpfsRoot.DecRef()
+ defer tmpfsFilesystem.DecRef(ctx)
+ defer tmpfsRoot.DecRef(ctx)
shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create tmpfs mount: %v", err)
@@ -403,12 +413,14 @@ func (k *Kernel) Init(args InitKernelArgs) error {
if err != nil {
return fmt.Errorf("failed to create sockfs filesystem: %v", err)
}
- defer socketFilesystem.DecRef()
+ defer socketFilesystem.DecRef(ctx)
socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create sockfs mount: %v", err)
}
k.socketMount = socketMount
+
+ k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord)
}
return nil
@@ -426,8 +438,8 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
defer k.extMu.Unlock()
// Stop time.
- k.pauseTimeLocked()
- defer k.resumeTimeLocked()
+ k.pauseTimeLocked(ctx)
+ defer k.resumeTimeLocked(ctx)
// Evict all evictable MemoryFile allocations.
k.mf.StartEvictions()
@@ -443,12 +455,12 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
// Remove all epoll waiter objects from underlying wait queues.
// NOTE: for programs to resume execution in future snapshot scenarios,
// we will need to re-establish these waiter objects after saving.
- k.tasks.unregisterEpollWaiters()
+ k.tasks.unregisterEpollWaiters(ctx)
// Clear the dirent cache before saving because Dirents must be Loaded in a
// particular order (parents before children), and Loading dirents from a cache
// breaks that order.
- if err := k.flushMountSourceRefs(); err != nil {
+ if err := k.flushMountSourceRefs(ctx); err != nil {
return err
}
@@ -501,7 +513,11 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
-func (k *Kernel) flushMountSourceRefs() error {
+func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
+ if VFS2Enabled {
+ return nil // Not relevant.
+ }
+
// Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
@@ -517,7 +533,7 @@ func (k *Kernel) flushMountSourceRefs() error {
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
- return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
+ return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
file.Dirent.Inode.MountSource.FlushDirentRefs()
return nil
})
@@ -527,12 +543,7 @@ func (k *Kernel) flushMountSourceRefs() error {
// each task.
//
// Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if VFS2Enabled {
- return nil
- }
-
+func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) {
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
@@ -540,7 +551,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
if t.fdTable == nil {
continue
}
- t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
if lastErr := f(file, fileVFS2); lastErr != nil && err == nil {
err = lastErr
}
@@ -551,7 +562,11 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
+ if VFS2Enabled {
+ return nil
+ }
+
+ return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
if flags := file.Flags(); !flags.Write {
return nil
}
@@ -598,7 +613,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
return nil
}
-func (ts *TaskSet) unregisterEpollWaiters() {
+func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
if VFS2Enabled {
return
@@ -619,7 +634,7 @@ func (ts *TaskSet) unregisterEpollWaiters() {
if _, ok := processed[t.fdTable]; ok {
continue
}
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
e.UnregisterEpollWaiters()
}
@@ -883,20 +898,21 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
opener fsbridge.Lookup
fsContext *FSContext
mntns *fs.MountNamespace
+ mntnsVFS2 *vfs.MountNamespace
)
if VFS2Enabled {
- mntnsVFS2 := args.MountNamespaceVFS2
+ mntnsVFS2 = args.MountNamespaceVFS2
if mntnsVFS2 == nil {
// MountNamespaceVFS2 adds a reference to the namespace, which is
// transferred to the new process.
mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
}
// Get the root directory from the MountNamespace.
- root := args.MountNamespaceVFS2.Root()
+ root := mntnsVFS2.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Grab the working directory.
wd := root // Default.
@@ -914,7 +930,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
- defer wd.DecRef()
+ defer wd.DecRef(ctx)
}
opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
fsContext = NewFSContextVFS2(root, wd, args.Umask)
@@ -929,7 +945,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
root := mntns.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Grab the working directory.
remainingTraversals := args.MaxSymlinkTraversals
@@ -940,7 +956,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
- defer wd.DecRef()
+ defer wd.DecRef(ctx)
}
opener = fsbridge.NewFSLookup(mntns, root, wd)
fsContext = newFSContext(root, wd, args.Umask)
@@ -1003,7 +1019,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
UTSNamespace: args.UTSNamespace,
IPCNamespace: args.IPCNamespace,
AbstractSocketNamespace: args.AbstractSocketNamespace,
- MountNamespaceVFS2: args.MountNamespaceVFS2,
+ MountNamespaceVFS2: mntnsVFS2,
ContainerID: args.ContainerID,
}
t, err := k.tasks.NewTask(config)
@@ -1050,7 +1066,7 @@ func (k *Kernel) Start() error {
// If k was created by LoadKernelFrom, timers were stopped during
// Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
// this is a no-op.
- k.resumeTimeLocked()
+ k.resumeTimeLocked(k.SupervisorContext())
// Start task goroutines.
k.tasks.mu.RLock()
defer k.tasks.mu.RUnlock()
@@ -1062,9 +1078,10 @@ func (k *Kernel) Start() error {
// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
-func (k *Kernel) pauseTimeLocked() {
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
+func (k *Kernel) pauseTimeLocked(ctx context.Context) {
// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
// Kernel.Start().
if k.cpuClockTicker != nil {
@@ -1086,7 +1103,7 @@ func (k *Kernel) pauseTimeLocked() {
// This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
if VFS2Enabled {
if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
tfd.PauseTimer()
@@ -1106,9 +1123,10 @@ func (k *Kernel) pauseTimeLocked() {
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
-func (k *Kernel) resumeTimeLocked() {
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
+func (k *Kernel) resumeTimeLocked(ctx context.Context) {
if k.cpuClockTicker != nil {
k.cpuClockTicker.Resume()
}
@@ -1122,7 +1140,7 @@ func (k *Kernel) resumeTimeLocked() {
}
}
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
if VFS2Enabled {
if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
tfd.ResumeTimer()
@@ -1258,6 +1276,13 @@ func (k *Kernel) Pause() {
k.tasks.aioGoroutines.Wait()
}
+// ReceiveTaskStates receives full states for all tasks.
+func (k *Kernel) ReceiveTaskStates() {
+ k.extMu.Lock()
+ k.tasks.PullFullState()
+ k.extMu.Unlock()
+}
+
// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
@@ -1465,6 +1490,11 @@ func (k *Kernel) NowMonotonic() int64 {
return now
}
+// AfterFunc implements tcpip.Clock.AfterFunc.
+func (k *Kernel) AfterFunc(d time.Duration, f func()) tcpip.Timer {
+ return ktime.TcpipAfterFunc(k.realtimeClock, d, f)
+}
+
// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
@@ -1489,20 +1519,27 @@ func (k *Kernel) SupervisorContext() context.Context {
}
}
-// SocketEntry represents a socket recorded in Kernel.sockets. It implements
+// SocketRecord represents a socket recorded in Kernel.socketsVFS2.
+//
+// +stateify savable
+type SocketRecord struct {
+ k *Kernel
+ Sock *refs.WeakRef // TODO(gvisor.dev/issue/1624): Only used by VFS1.
+ SockVFS2 *vfs.FileDescription // Only used by VFS2.
+ ID uint64 // Socket table entry number.
+}
+
+// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It implements
// refs.WeakRefUser for sockets stored in the socket table.
//
// +stateify savable
-type SocketEntry struct {
+type SocketRecordVFS1 struct {
socketEntry
- k *Kernel
- Sock *refs.WeakRef
- SockVFS2 *vfs.FileDescription
- ID uint64 // Socket table entry number.
+ SocketRecord
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (s *SocketEntry) WeakRefGone() {
+func (s *SocketRecordVFS1) WeakRefGone(context.Context) {
s.k.extMu.Lock()
s.k.sockets.Remove(s)
s.k.extMu.Unlock()
@@ -1513,9 +1550,14 @@ func (s *SocketEntry) WeakRefGone() {
// Precondition: Caller must hold a reference to sock.
func (k *Kernel) RecordSocket(sock *fs.File) {
k.extMu.Lock()
- id := k.nextSocketEntry
- k.nextSocketEntry++
- s := &SocketEntry{k: k, ID: id}
+ id := k.nextSocketRecord
+ k.nextSocketRecord++
+ s := &SocketRecordVFS1{
+ SocketRecord: SocketRecord{
+ k: k,
+ ID: id,
+ },
+ }
s.Sock = refs.NewWeakRef(sock, s)
k.sockets.PushBack(s)
k.extMu.Unlock()
@@ -1527,29 +1569,45 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
-// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+// vfs.FileDescription.
func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
k.extMu.Lock()
- id := k.nextSocketEntry
- k.nextSocketEntry++
- s := &SocketEntry{
+ if _, ok := k.socketsVFS2[sock]; ok {
+ panic(fmt.Sprintf("Socket %p added twice", sock))
+ }
+ id := k.nextSocketRecord
+ k.nextSocketRecord++
+ s := &SocketRecord{
k: k,
ID: id,
SockVFS2: sock,
}
- k.sockets.PushBack(s)
+ k.socketsVFS2[sock] = s
+ k.extMu.Unlock()
+}
+
+// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table.
+func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) {
+ k.extMu.Lock()
+ delete(k.socketsVFS2, sock)
k.extMu.Unlock()
}
// ListSockets returns a snapshot of all sockets.
//
-// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef()
// to get a reference on a socket in the table.
-func (k *Kernel) ListSockets() []*SocketEntry {
+func (k *Kernel) ListSockets() []*SocketRecord {
k.extMu.Lock()
- var socks []*SocketEntry
- for s := k.sockets.Front(); s != nil; s = s.Next() {
- socks = append(socks, s)
+ var socks []*SocketRecord
+ if VFS2Enabled {
+ for _, s := range k.socketsVFS2 {
+ socks = append(socks, s)
+ }
+ } else {
+ for s := k.sockets.Front(); s != nil; s = s.Next() {
+ socks = append(socks, &s.SocketRecord)
+ }
}
k.extMu.Unlock()
return socks
@@ -1591,7 +1649,7 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
return vfs.VirtualDentry{}
}
mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
// Root() takes a reference on the root dirent for us.
return mntns.Root()
case vfs.CtxMountNamespace:
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 449643118..99134e634 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -21,6 +21,7 @@ go_library(
"//pkg/amutex",
"//pkg/buffer",
"//pkg/context",
+ "//pkg/marshal/primitive",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index 4b688c627..6497dc4ba 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -93,7 +93,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
if !waitFor(&i.mu, &i.wWakeup, ctx) {
- r.DecRef()
+ r.DecRef(ctx)
return nil, syserror.ErrInterrupted
}
}
@@ -111,12 +111,12 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
// On a nonblocking, write-only open, the open fails with ENXIO if the
// read side isn't open yet.
if flags.NonBlocking {
- w.DecRef()
+ w.DecRef(ctx)
return nil, syserror.ENXIO
}
if !waitFor(&i.mu, &i.rWakeup, ctx) {
- w.DecRef()
+ w.DecRef(ctx)
return nil, syserror.ErrInterrupted
}
}
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
index ab75a87ff..ce0db5583 100644
--- a/pkg/sentry/kernel/pipe/node_test.go
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -167,7 +167,7 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) {
f := NewInodeOperations(ctx, perms, newNamedPipe(t))
rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil)
- rFile.DecRef()
+ rFile.DecRef(ctx)
wDone := make(chan struct{})
// This open for write should block because the reader is now gone.
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 79645d7d2..67beb0ad6 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -17,6 +17,7 @@ package pipe
import (
"fmt"
+ "io"
"sync/atomic"
"syscall"
@@ -152,7 +153,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.
d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))
// The p.Open calls below will each take a reference on the Dirent. We
// must drop the one we already have.
- defer d.DecRef()
+ defer d.DecRef(ctx)
return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true})
}
@@ -200,22 +201,22 @@ type readOps struct {
//
// Precondition: this pipe must have readers.
func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
- // Don't block for a zero-length read even if the pipe is empty.
- if ops.left() == 0 {
- return 0, nil
- }
-
p.mu.Lock()
defer p.mu.Unlock()
return p.readLocked(ctx, ops)
}
func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
+ // Don't block for a zero-length read even if the pipe is empty.
+ if ops.left() == 0 {
+ return 0, nil
+ }
+
// Is the pipe empty?
if p.view.Size() == 0 {
if !p.HasWriters() {
// There are no writers, return EOF.
- return 0, nil
+ return 0, io.EOF
}
return 0, syserror.ErrWouldBlock
}
@@ -388,6 +389,10 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
func (p *Pipe) queued() int64 {
p.mu.Lock()
defer p.mu.Unlock()
+ return p.queuedLocked()
+}
+
+func (p *Pipe) queuedLocked() int64 {
return p.view.Size()
}
diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go
index bda739dbe..fe97e9800 100644
--- a/pkg/sentry/kernel/pipe/pipe_test.go
+++ b/pkg/sentry/kernel/pipe/pipe_test.go
@@ -27,8 +27,8 @@ import (
func TestPipeRW(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, 65536, 4096)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
msg := []byte("here's some bytes")
wantN := int64(len(msg))
@@ -47,8 +47,8 @@ func TestPipeRW(t *testing.T) {
func TestPipeReadBlock(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, 65536, 4096)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1)))
if n != 0 || err != syserror.ErrWouldBlock {
@@ -62,8 +62,8 @@ func TestPipeWriteBlock(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
msg := make([]byte, capacity+1)
n, err := w.Writev(ctx, usermem.BytesIOSequence(msg))
@@ -77,8 +77,8 @@ func TestPipeWriteUntilEnd(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
msg := []byte("here's some bytes")
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index aacf28da2..f665920cb 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/buffer"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
@@ -33,7 +34,7 @@ import (
// the old fs architecture.
// Release cleans up the pipe's state.
-func (p *Pipe) Release() {
+func (p *Pipe) Release(context.Context) {
p.rClose()
p.wClose()
@@ -145,9 +146,14 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume
v = math.MaxInt32 // Silently truncate.
}
// Copy result to userspace.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ iocc := primitive.IOCopyContext{
+ IO: io,
+ Ctx: ctx,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ }
+ _, err := primitive.CopyInt32Out(&iocc, args[2].Pointer(), int32(v))
return 0, err
default:
return 0, syscall.ENOTTY
diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go
index 7724b4452..ac18785c0 100644
--- a/pkg/sentry/kernel/pipe/reader.go
+++ b/pkg/sentry/kernel/pipe/reader.go
@@ -15,6 +15,7 @@
package pipe
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -29,7 +30,7 @@ type Reader struct {
// Release implements fs.FileOperations.Release.
//
// This overrides ReaderWriter.Release.
-func (r *Reader) Release() {
+func (r *Reader) Release(context.Context) {
r.Pipe.rClose()
// Wake up writers.
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 45d4c5fc1..f61039f5b 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -67,6 +67,11 @@ func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlag
return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
+ return syserror.ESPIPE
+}
+
// Open opens the pipe represented by vp.
func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
vp.mu.Lock()
@@ -101,7 +106,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
// If this pipe is being opened as blocking and there's no
// writer, we have to wait for a writer to open the other end.
if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.EINTR
}
@@ -112,12 +117,12 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
// Non-blocking, write-only opens fail with ENXIO when the read
// side isn't open yet.
if statusFlags&linux.O_NONBLOCK != 0 {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.ENXIO
}
// Wait for a reader to open the other end.
if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.EINTR
}
}
@@ -169,7 +174,7 @@ type VFSPipeFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *VFSPipeFD) Release() {
+func (fd *VFSPipeFD) Release(context.Context) {
var event waiter.EventMask
if fd.vfsfd.IsReadable() {
fd.pipe.rClose()
@@ -244,19 +249,57 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
return fd.pipe.SetFifoSize(size)
}
-// IOSequence returns a useremm.IOSequence that reads up to count bytes from,
-// or writes up to count bytes to, fd.
-func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
- return usermem.IOSequence{
+// SpliceToNonPipe performs a splice operation from fd to a non-pipe file.
+func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) {
+ fd.pipe.mu.Lock()
+ defer fd.pipe.mu.Unlock()
+
+ // Cap the sequence at number of bytes actually available.
+ v := fd.pipe.queuedLocked()
+ if v < count {
+ count = v
+ }
+ src := usermem.IOSequence{
IO: fd,
Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
}
+
+ var (
+ n int64
+ err error
+ )
+ if off == -1 {
+ n, err = out.Write(ctx, src, vfs.WriteOptions{})
+ } else {
+ n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{})
+ }
+ if n > 0 {
+ fd.pipe.view.TrimFront(n)
+ }
+ return n, err
+}
+
+// SpliceFromNonPipe performs a splice operation from a non-pipe file to fd.
+func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
+ fd.pipe.mu.Lock()
+ defer fd.pipe.mu.Unlock()
+
+ dst := usermem.IOSequence{
+ IO: fd,
+ Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+ }
+
+ if off == -1 {
+ return in.Read(ctx, dst, vfs.ReadOptions{})
+ }
+ return in.PRead(ctx, dst, off, vfs.ReadOptions{})
}
-// CopyIn implements usermem.IO.CopyIn.
+// CopyIn implements usermem.IO.CopyIn. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
origCount := int64(len(dst))
- n, err := fd.pipe.read(ctx, readOps{
+ n, err := fd.pipe.readLocked(ctx, readOps{
left: func() int64 {
return int64(len(dst))
},
@@ -265,7 +308,6 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
},
read: func(view *buffer.View) (int64, error) {
n, err := view.ReadAt(dst, 0)
- view.TrimFront(int64(n))
return int64(n), err
},
})
@@ -281,7 +323,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
// CopyOut implements usermem.IO.CopyOut.
func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
origCount := int64(len(src))
- n, err := fd.pipe.write(ctx, writeOps{
+ n, err := fd.pipe.writeLocked(ctx, writeOps{
left: func() int64 {
return int64(len(src))
},
@@ -305,7 +347,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte,
// ZeroOut implements usermem.IO.ZeroOut.
func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
origCount := toZero
- n, err := fd.pipe.write(ctx, writeOps{
+ n, err := fd.pipe.writeLocked(ctx, writeOps{
left: func() int64 {
return toZero
},
@@ -326,14 +368,15 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6
return n, err
}
-// CopyInTo implements usermem.IO.CopyInTo.
+// CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
count := ars.NumBytes()
if count == 0 {
return 0, nil
}
origCount := count
- n, err := fd.pipe.read(ctx, readOps{
+ n, err := fd.pipe.readLocked(ctx, readOps{
left: func() int64 {
return count
},
@@ -342,7 +385,6 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst
},
read: func(view *buffer.View) (int64, error) {
n, err := view.ReadToSafememWriter(dst, uint64(count))
- view.TrimFront(int64(n))
return int64(n), err
},
})
@@ -362,7 +404,7 @@ func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq,
return 0, nil
}
origCount := count
- n, err := fd.pipe.write(ctx, writeOps{
+ n, err := fd.pipe.writeLocked(ctx, writeOps{
left: func() int64 {
return count
},
diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go
index 5bc6aa931..ef4b70ca3 100644
--- a/pkg/sentry/kernel/pipe/writer.go
+++ b/pkg/sentry/kernel/pipe/writer.go
@@ -15,6 +15,7 @@
package pipe
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -29,7 +30,7 @@ type Writer struct {
// Release implements fs.FileOperations.Release.
//
// This overrides ReaderWriter.Release.
-func (w *Writer) Release() {
+func (w *Writer) Release(context.Context) {
w.Pipe.wClose()
// Wake up readers.
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index e23e796ef..1145faf13 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/syserror"
@@ -224,8 +225,9 @@ func (s *ptraceStop) Killable() bool {
// beginPtraceStopLocked does not signal t's tracer or wake it if it is
// waiting.
//
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine.
func (t *Task) beginPtraceStopLocked() bool {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
@@ -270,8 +272,9 @@ func (t *Task) ptraceTrapLocked(code int32) {
// ptraceStop, temporarily preventing it from being removed by a concurrent
// Task.Kill, and returns true. Otherwise it returns false.
//
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine of t's tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine of t's tracer.
func (t *Task) ptraceFreeze() bool {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
@@ -301,8 +304,9 @@ func (t *Task) ptraceUnfreeze() {
t.ptraceUnfreezeLocked()
}
-// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be
-// locked.
+// Preconditions:
+// * t must be in a frozen ptraceStop.
+// * t's signal mutex must be locked.
func (t *Task) ptraceUnfreezeLocked() {
// Do this even if the task has been killed to ensure a panic if t.stop is
// nil or not a ptraceStop.
@@ -497,8 +501,9 @@ func (t *Task) forgetTracerLocked() {
// ptraceSignalLocked is called after signal dequeueing to check if t should
// enter ptrace signal-delivery-stop.
//
-// Preconditions: The signal mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The signal mutex must be locked.
+// * The caller must be running on the task goroutine.
func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool {
if linux.Signal(info.Signo) == linux.SIGKILL {
return false
@@ -828,8 +833,9 @@ func (t *Task) ptraceInterrupt(target *Task) error {
return nil
}
-// Preconditions: The TaskSet mutex must be locked for writing. t must have a
-// tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked for writing.
+// * t must have a tracer.
func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
const valid = uintptr(linux.PTRACE_O_EXITKILL |
linux.PTRACE_O_TRACESYSGOOD |
@@ -994,18 +1000,15 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
// at the address specified by the data parameter, and the return value
// is the error flag." - ptrace(2)
word := t.Arch().Native(0)
- if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{
- IgnorePermissions: true,
- }); err != nil {
+ if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
return err
}
- _, err := t.CopyOut(data, word)
+ _, err := word.CopyOut(t, data)
return err
case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
- _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{
- IgnorePermissions: true,
- })
+ word := t.Arch().Native(uintptr(data))
+ _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr)
return err
case linux.PTRACE_GETREGSET:
@@ -1018,6 +1021,9 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if err != nil {
return err
}
+
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
ar := ars.Head()
n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{
Ctx: t,
@@ -1044,10 +1050,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if err != nil {
return err
}
+
+ mm := t.MemoryManager()
+ t.p.PullFullState(mm.AddressSpace(), t.Arch())
+
ar := ars.Head()
n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{
Ctx: t,
- IO: t.MemoryManager(),
+ IO: mm,
Addr: ar.Start,
Opts: usermem.IOOpts{
AddressSpaceActive: true,
@@ -1056,6 +1066,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if err != nil {
return err
}
+ t.p.FullStateChanged()
ar.End -= usermem.Addr(n)
return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))
@@ -1065,12 +1076,12 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if target.ptraceSiginfo == nil {
return syserror.EINVAL
}
- _, err := t.CopyOut(data, target.ptraceSiginfo)
+ _, err := target.ptraceSiginfo.CopyOut(t, data)
return err
case linux.PTRACE_SETSIGINFO:
var info arch.SignalInfo
- if _, err := t.CopyIn(data, &info); err != nil {
+ if _, err := info.CopyIn(t, data); err != nil {
return err
}
t.tg.pidns.owner.mu.RLock()
@@ -1085,7 +1096,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if addr != linux.SignalSetSize {
return syserror.EINVAL
}
- _, err := t.CopyOut(data, target.SignalMask())
+ mask := target.SignalMask()
+ _, err := mask.CopyOut(t, data)
return err
case linux.PTRACE_SETSIGMASK:
@@ -1093,7 +1105,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
return syserror.EINVAL
}
var mask linux.SignalSet
- if _, err := t.CopyIn(data, &mask); err != nil {
+ if _, err := mask.CopyIn(t, data); err != nil {
return err
}
// The target's task goroutine is stopped, so this is safe:
@@ -1108,7 +1120,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
case linux.PTRACE_GETEVENTMSG:
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
- _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg)
+ _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg)
return err
// PEEKSIGINFO is unimplemented but seems to have no users anywhere.
diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go
index cef1276ec..609ad3941 100644
--- a/pkg/sentry/kernel/ptrace_amd64.go
+++ b/pkg/sentry/kernel/ptrace_amd64.go
@@ -30,7 +30,7 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro
if err != nil {
return err
}
- _, err = t.CopyOut(data, n)
+ _, err = n.CopyOut(t, data)
return err
case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 18416643b..2a9023fdf 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -173,8 +173,10 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr {
// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
// t's CPU number.
//
-// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions:
+// * t.RSeqAvailable() == true.
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
t.oldRSeqCPUAddr = addr
@@ -189,8 +191,9 @@ func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
return nil
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqUpdateCPU() error {
if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
t.rseqCPU = -1
@@ -209,8 +212,9 @@ func (t *Task) rseqUpdateCPU() error {
return oerr
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) oldRSeqCopyOutCPU() error {
if t.oldRSeqCPUAddr == 0 {
return nil
@@ -222,8 +226,9 @@ func (t *Task) oldRSeqCopyOutCPU() error {
return err
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqCopyOutCPU() error {
if t.rseqAddr == 0 {
return nil
@@ -240,8 +245,9 @@ func (t *Task) rseqCopyOutCPU() error {
return err
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqClearCPU() error {
buf := t.CopyScratchBuffer(8)
// CPUIDStart and CPUID are the first two fields in linux.RSeq.
@@ -269,8 +275,9 @@ func (t *Task) rseqClearCPU() error {
//
// See kernel/rseq.c:rseq_ip_fixup for reference.
//
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqAddrInterrupt() {
if t.rseqAddr == 0 {
return
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index c38c5a40c..387edfa91 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -18,7 +18,6 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/syserror"
@@ -27,25 +26,18 @@ import (
const maxSyscallFilterInstructions = 1 << 15
-// seccompData is equivalent to struct seccomp_data, which contains the data
-// passed to seccomp-bpf filters.
-type seccompData struct {
- // nr is the system call number.
- nr int32
-
- // arch is an AUDIT_ARCH_* value indicating the system call convention.
- arch uint32
-
- // instructionPointer is the value of the instruction pointer at the time
- // of the system call.
- instructionPointer uint64
-
- // args contains the first 6 system call arguments.
- args [6]uint64
-}
-
-func (d *seccompData) asBPFInput() bpf.Input {
- return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder}
+// dataAsBPFInput returns a serialized BPF program, only valid on the current task
+// goroutine.
+//
+// Note: this is called for every syscall, which is a very hot path.
+func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input {
+ buf := t.CopyScratchBuffer(d.SizeBytes())
+ d.MarshalUnsafe(buf)
+ return bpf.InputBytes{
+ Data: buf,
+ // Go-marshal always uses the native byte order.
+ Order: usermem.ByteOrder,
+ }
}
func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo {
@@ -112,20 +104,20 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u
}
func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
- data := seccompData{
- nr: sysno,
- arch: t.tc.st.AuditNumber,
- instructionPointer: uint64(ip),
+ data := linux.SeccompData{
+ Nr: sysno,
+ Arch: t.tc.st.AuditNumber,
+ InstructionPointer: uint64(ip),
}
// data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
// we can't do any slicing tricks or even use copy/append here.
for i, arg := range args {
- if i >= len(data.args) {
+ if i >= len(data.Args) {
break
}
- data.args[i] = arg.Uint64()
+ data.Args[i] = arg.Uint64()
}
- input := data.asBPFInput()
+ input := dataAsBPFInput(t, &data)
ret := uint32(linux.SECCOMP_RET_ALLOW)
f := t.syscallFilters.Load()
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 0e19286de..df5c8421b 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -16,7 +16,6 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -31,7 +30,7 @@ type ProcessGroupID ThreadID
//
// +stateify savable
type Session struct {
- refs refs.AtomicRefCount
+ SessionRefs
// leader is the originator of the Session.
//
@@ -61,16 +60,11 @@ type Session struct {
sessionEntry
}
-// incRef grabs a reference.
-func (s *Session) incRef() {
- s.refs.IncRef()
-}
-
-// decRef drops a reference.
+// DecRef drops a reference.
//
// Precondition: callers must hold TaskSet.mu for writing.
-func (s *Session) decRef() {
- s.refs.DecRefWithDestructor(func() {
+func (s *Session) DecRef() {
+ s.SessionRefs.DecRef(func() {
// Remove translations from the leader.
for ns := s.leader.pidns; ns != nil; ns = ns.parent {
id := ns.sids[s]
@@ -87,7 +81,7 @@ func (s *Session) decRef() {
//
// +stateify savable
type ProcessGroup struct {
- refs refs.AtomicRefCount // not exported.
+ refs ProcessGroupRefs
// originator is the originator of the group.
//
@@ -162,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
}
alive := true
- pg.refs.DecRefWithDestructor(func() {
+ pg.refs.DecRef(func() {
alive = false // don't bother with handleOrphan.
// Remove translations from the originator.
@@ -174,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
// Remove the list of process groups.
pg.session.processGroups.Remove(pg)
- pg.session.decRef()
+ pg.session.DecRef()
})
if alive {
pg.handleOrphan()
@@ -301,7 +295,7 @@ func (tg *ThreadGroup) createSession() error {
id: SessionID(id),
leader: tg,
}
- s.refs.EnableLeakCheck("kernel.Session")
+ s.EnableLeakCheck()
// Create a new ProcessGroup, belonging to that Session.
// This also has a single reference (assigned below).
@@ -315,7 +309,7 @@ func (tg *ThreadGroup) createSession() error {
session: s,
ancestors: 0,
}
- pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+ pg.refs.EnableLeakCheck()
// Tie them and return the result.
s.processGroups.PushBack(pg)
@@ -395,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
//
// We manually adjust the ancestors if the parent is in the same
// session.
- tg.processGroup.session.incRef()
+ tg.processGroup.session.IncRef()
pg := ProcessGroup{
id: ProcessGroupID(id),
originator: tg,
session: tg.processGroup.session,
}
- pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+ pg.refs.EnableLeakCheck()
if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
pg.ancestors++
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index bfd779837..b7e4b480d 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -1,12 +1,25 @@
load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
package(licenses = ["notice"])
+go_template_instance(
+ name = "shm_refs",
+ out = "shm_refs.go",
+ package = "shm",
+ prefix = "Shm",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "Shm",
+ },
+)
+
go_library(
name = "shm",
srcs = [
"device.go",
"shm.go",
+ "shm_refs.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
@@ -20,7 +33,6 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/memmap",
"//pkg/sentry/pgalloc",
- "//pkg/sentry/platform",
"//pkg/sentry/usage",
"//pkg/sync",
"//pkg/syserror",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index f66cfcc7f..00c03585e 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -39,13 +39,11 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -253,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
creatorPID: pid,
changeTime: ktime.NowFromContext(ctx),
}
- shm.EnableLeakCheck("kernel.Shm")
+ shm.EnableLeakCheck()
// Find the next available ID.
for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
@@ -338,14 +336,14 @@ func (r *Registry) remove(s *Shm) {
//
// +stateify savable
type Shm struct {
- // AtomicRefCount tracks the number of references to this segment.
+ // ShmRefs tracks the number of references to this segment.
//
// A segment holds a reference to itself until it is marked for
// destruction.
//
// In addition to direct users, the MemoryManager will hold references
// via MappingIdentity.
- refs.AtomicRefCount
+ ShmRefs
mfp pgalloc.MemoryFileProvider
@@ -370,7 +368,7 @@ type Shm struct {
// fr is the offset into mfp.MemoryFile() that backs this contents of this
// segment. Immutable.
- fr platform.FileRange
+ fr memmap.FileRange
// mu protects all fields below.
mu sync.Mutex `state:"nosave"`
@@ -429,11 +427,14 @@ func (s *Shm) InodeID() uint64 {
return uint64(s.ID)
}
-// DecRef overrides refs.RefCount.DecRef with a destructor.
+// DecRef drops a reference on s.
//
// Precondition: Caller must not hold s.mu.
-func (s *Shm) DecRef() {
- s.DecRefWithDestructor(s.destroy)
+func (s *Shm) DecRef(ctx context.Context) {
+ s.ShmRefs.DecRef(func() {
+ s.mfp.MemoryFile().DecRef(s.fr)
+ s.registry.remove(s)
+ })
}
// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
@@ -643,16 +644,11 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
return nil
}
-func (s *Shm) destroy() {
- s.mfp.MemoryFile().DecRef(s.fr)
- s.registry.remove(s)
-}
-
// MarkDestroyed marks a segment for destruction. The segment is actually
// destroyed once it has no references. MarkDestroyed may be called multiple
// times, and is safe to call after a segment has already been destroyed. See
// shmctl(IPC_RMID).
-func (s *Shm) MarkDestroyed() {
+func (s *Shm) MarkDestroyed(ctx context.Context) {
s.registry.dissociateKey(s)
s.mu.Lock()
@@ -664,7 +660,7 @@ func (s *Shm) MarkDestroyed() {
//
// N.B. This cannot be the final DecRef, as the caller also
// holds a reference.
- s.DecRef()
+ s.DecRef(ctx)
return
}
}
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 3eb78e91b..76d472292 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -8,7 +8,6 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/sentry/fs",
"//pkg/sentry/fs/anon",
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index 8243bb93e..78f718cfe 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -17,7 +17,6 @@ package signalfd
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
@@ -76,7 +75,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) {
}
// Release implements fs.FileOperations.Release.
-func (s *SignalOperations) Release() {}
+func (s *SignalOperations) Release(context.Context) {}
// Mask returns the signal mask.
func (s *SignalOperations) Mask() linux.SignalSet {
@@ -103,8 +102,7 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
}
// Copy out the signal info using the specified format.
- var buf [128]byte
- binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{
+ infoNative := linux.SignalfdSiginfo{
Signo: uint32(info.Signo),
Errno: info.Errno,
Code: info.Code,
@@ -113,9 +111,13 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
Status: info.Status(),
Overrun: uint32(info.Overrun()),
Addr: info.Addr(),
- })
- n, err := dst.CopyOut(ctx, buf[:])
- return int64(n), err
+ }
+ n, err := infoNative.WriteTo(dst.Writer(ctx))
+ if err == usermem.ErrEndOfIOSequence {
+ // Partial copy-out ok.
+ err = nil
+ }
+ return n, err
}
// Readiness implements waiter.Waitable.Readiness.
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 413111faf..332bdb8e8 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -348,6 +348,16 @@ func (s *SyscallTable) LookupName(sysno uintptr) string {
return fmt.Sprintf("sys_%d", sysno) // Unlikely.
}
+// LookupNo looks up a syscall number by name.
+func (s *SyscallTable) LookupNo(name string) (uintptr, error) {
+ for i, syscall := range s.Table {
+ if syscall.Name == name {
+ return uintptr(i), nil
+ }
+ }
+ return 0, fmt.Errorf("syscall %q not found", name)
+}
+
// LookupEmulate looks up an emulation syscall number.
func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
sysno, ok := s.Emulate[addr]
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index 4607cde2f..a83ce219c 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -98,6 +98,15 @@ func (s *syslog) Log() []byte {
s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...)
}
+ if VFS2Enabled {
+ time += rand.Float64() / 2
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up VFS2..."))...)
+ if FUSEEnabled {
+ time += rand.Float64() / 2
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up FUSE..."))...)
+ }
+ }
+
time += rand.Float64() / 2
s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...)
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index f48247c94..f796e0fa3 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -68,6 +68,21 @@ type Task struct {
// runState is exclusive to the task goroutine.
runState taskRunState
+ // taskWorkCount represents the current size of the task work queue. It is
+ // used to avoid acquiring taskWorkMu when the queue is empty.
+ //
+ // Must accessed with atomic memory operations.
+ taskWorkCount int32
+
+ // taskWorkMu protects taskWork.
+ taskWorkMu sync.Mutex `state:"nosave"`
+
+ // taskWork is a queue of work to be executed before resuming user execution.
+ // It is similar to the task_work mechanism in Linux.
+ //
+ // taskWork is exclusive to the task goroutine.
+ taskWork []TaskWorker
+
// haveSyscallReturn is true if tc.Arch().Return() represents a value
// returned by a syscall (or set by ptrace after a syscall).
//
@@ -550,11 +565,20 @@ type Task struct {
// futexWaiter is exclusive to the task goroutine.
futexWaiter *futex.Waiter `state:"nosave"`
+ // robustList is a pointer to the head of the tasks's robust futex
+ // list.
+ robustList usermem.Addr
+
// startTime is the real time at which the task started. It is set when
// a Task is created or invokes execve(2).
//
// startTime is protected by mu.
startTime ktime.Time
+
+ // kcov is the kcov instance providing code coverage owned by this task.
+ //
+ // kcov is exclusive to the task goroutine.
+ kcov *Kcov
}
func (t *Task) savePtraceTracer() *Task {
@@ -711,17 +735,17 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
func (t *Task) IsChrooted() bool {
if VFS2Enabled {
realRoot := t.mountNamespaceVFS2.Root()
- defer realRoot.DecRef()
+ defer realRoot.DecRef(t)
root := t.fsContext.RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
return root != realRoot
}
realRoot := t.tg.mounts.Root()
- defer realRoot.DecRef()
+ defer realRoot.DecRef(t)
root := t.fsContext.RootDirectory()
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(t)
}
return root != realRoot
}
@@ -884,3 +908,16 @@ func (t *Task) UID() uint32 {
func (t *Task) GID() uint32 {
return uint32(t.Credentials().EffectiveKGID)
}
+
+// SetKcov sets the kcov instance associated with t.
+func (t *Task) SetKcov(k *Kcov) {
+ t.kcov = k
+}
+
+// ResetKcov clears the kcov instance associated with t.
+func (t *Task) ResetKcov() {
+ if t.kcov != nil {
+ t.kcov.OnTaskExit()
+ t.kcov = nil
+ }
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index e1ecca99e..fce1064a7 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -161,6 +161,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
return 0, nil, syserror.EINVAL
}
+ // Pull task registers and FPU state, a cloned task will inherit the
+ // state of the current task.
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
// single clone(2) or unshare(2) call, the user namespace is guaranteed to
// be created first, giving the child (clone(2)) or caller (unshare(2))
@@ -237,7 +241,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
var fdTable *FDTable
if opts.NewFiles {
- fdTable = t.fdTable.Fork()
+ fdTable = t.fdTable.Fork(t)
} else {
fdTable = t.fdTable
fdTable.IncRef()
@@ -294,7 +298,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
nt, err := t.tg.pidns.owner.NewTask(cfg)
if err != nil {
if opts.NewThreadGroup {
- tg.release()
+ tg.release(t)
}
return 0, nil, err
}
@@ -337,12 +341,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
nt.SetClearTID(opts.ChildTID)
}
if opts.ChildSetTID {
- // Can't use Task.CopyOut, which assumes AddressSpaceActive.
- usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{})
+ ctid := nt.ThreadID()
+ ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
}
ntid := t.tg.pidns.IDOfTask(nt)
if opts.ParentSetTID {
- t.CopyOut(opts.ParentTID, ntid)
+ ntid.CopyOut(t, opts.ParentTID)
}
kind := ptraceCloneKindClone
@@ -510,7 +514,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
var oldFDTable *FDTable
if opts.NewFiles {
oldFDTable = t.fdTable
- t.fdTable = oldFDTable.Fork()
+ t.fdTable = oldFDTable.Fork(t)
}
var oldFSContext *FSContext
if opts.NewFSContext {
@@ -519,10 +523,10 @@ func (t *Task) Unshare(opts *SharingOptions) error {
}
t.mu.Unlock()
if oldFDTable != nil {
- oldFDTable.DecRef()
+ oldFDTable.DecRef(t)
}
if oldFSContext != nil {
- oldFSContext.DecRef()
+ oldFSContext.DecRef(t)
}
return nil
}
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 9fa528384..d1136461a 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -126,7 +126,11 @@ func (t *Task) SyscallTable() *SyscallTable {
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) Stack() *arch.Stack {
- return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())}
+ return &arch.Stack{
+ Arch: t.Arch(),
+ IO: t.MemoryManager(),
+ Bottom: usermem.Addr(t.Arch().Stack()),
+ }
}
// LoadTaskImage loads a specified file into a new TaskContext.
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 9b69f3cbe..412d471d3 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -199,14 +199,17 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.tg.pidns.owner.mu.Unlock()
oldFDTable := t.fdTable
- t.fdTable = t.fdTable.Fork()
- oldFDTable.DecRef()
+ t.fdTable = t.fdTable.Fork(t)
+ oldFDTable.DecRef(t)
// Remove FDs with the CloseOnExec flag set.
- t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
+ t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
return flags.CloseOnExec
})
+ // Handle the robust futex list.
+ t.exitRobustList()
+
// NOTE(b/30815691): We currently do not implement privileged
// executables (set-user/group-ID bits and file capabilities). This
// allows us to unconditionally enable user dumpability on the new mm.
@@ -223,6 +226,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.tc = *r.tc
t.mu.Unlock()
t.unstopVforkParent()
+ t.p.FullStateChanged()
// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
t.MemoryManager().Activate(t)
@@ -233,9 +237,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
// promoteLocked makes t the leader of its thread group. If t is already the
// thread group leader, promoteLocked is a no-op.
//
-// Preconditions: All other tasks in t's thread group, including the existing
-// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must
-// be locked for writing.
+// Preconditions:
+// * All other tasks in t's thread group, including the existing leader (if it
+// is not t), have reached TaskExitZombie.
+// * The TaskSet mutex must be locked for writing.
func (t *Task) promoteLocked() {
oldLeader := t.tg.leader
if t == oldLeader {
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index c4ade6e8e..b400a8b41 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -239,6 +239,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
t.traceExitEvent()
lastExiter := t.exitThreadGroup()
+ t.ResetKcov()
+
// If the task has a cleartid, and the thread group wasn't killed by a
// signal, handle that before releasing the MM.
if t.cleartid != 0 {
@@ -246,13 +248,17 @@ func (*runExitMain) execute(t *Task) taskRunState {
signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
t.tg.signalHandlers.mu.Unlock()
if !signaled {
- if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil {
+ zero := ThreadID(0)
+ if _, err := zero.CopyOut(t, t.cleartid); err == nil {
t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
}
// If the CopyOut fails, there's nothing we can do.
}
}
+ // Handle the robust futex list.
+ t.exitRobustList()
+
// Deactivate the address space and update max RSS before releasing the
// task's MM.
t.Deactivate()
@@ -266,12 +272,12 @@ func (*runExitMain) execute(t *Task) taskRunState {
// Releasing the MM unblocks a blocked CLONE_VFORK parent.
t.unstopVforkParent()
- t.fsContext.DecRef()
- t.fdTable.DecRef()
+ t.fsContext.DecRef(t)
+ t.fdTable.DecRef(t)
t.mu.Lock()
if t.mountNamespaceVFS2 != nil {
- t.mountNamespaceVFS2.DecRef()
+ t.mountNamespaceVFS2.DecRef(t)
t.mountNamespaceVFS2 = nil
}
t.mu.Unlock()
@@ -279,7 +285,7 @@ func (*runExitMain) execute(t *Task) taskRunState {
// If this is the last task to exit from the thread group, release the
// thread group's resources.
if lastExiter {
- t.tg.release()
+ t.tg.release(t)
}
// Detach tracees.
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index a53e77c9f..c80391475 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -15,6 +15,8 @@
package kernel
import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -52,3 +54,127 @@ func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) {
func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) {
return t.MemoryManager().GetSharedFutexKey(t, addr)
}
+
+// GetRobustList sets the robust futex list for the task.
+func (t *Task) GetRobustList() usermem.Addr {
+ t.mu.Lock()
+ addr := t.robustList
+ t.mu.Unlock()
+ return addr
+}
+
+// SetRobustList sets the robust futex list for the task.
+func (t *Task) SetRobustList(addr usermem.Addr) {
+ t.mu.Lock()
+ t.robustList = addr
+ t.mu.Unlock()
+}
+
+// exitRobustList walks the robust futex list, marking locks dead and notifying
+// wakers. It corresponds to Linux's exit_robust_list(). Following Linux,
+// errors are silently ignored.
+func (t *Task) exitRobustList() {
+ t.mu.Lock()
+ addr := t.robustList
+ t.robustList = 0
+ t.mu.Unlock()
+
+ if addr == 0 {
+ return
+ }
+
+ var rl linux.RobustListHead
+ if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil {
+ return
+ }
+
+ next := primitive.Uint64(rl.List)
+ done := 0
+ var pendingLockAddr usermem.Addr
+ if rl.ListOpPending != 0 {
+ pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset)
+ }
+
+ // Wake up normal elements.
+ for usermem.Addr(next) != addr {
+ // We traverse to the next element of the list before we
+ // actually wake anything. This prevents the race where waking
+ // this futex causes a modification of the list.
+ thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset)
+
+ // Try to decode the next element in the list before waking the
+ // current futex. But don't check the error until after we've
+ // woken the current futex. Linux does it in this order too
+ _, nextErr := next.CopyIn(t, usermem.Addr(next))
+
+ // Wakeup the current futex if it's not pending.
+ if thisLockAddr != pendingLockAddr {
+ t.wakeRobustListOne(thisLockAddr)
+ }
+
+ // If there was an error copying the next futex, we must bail.
+ if nextErr != nil {
+ break
+ }
+
+ // This is a user structure, so it could be a massive list, or
+ // even contain a loop if they are trying to mess with us. We
+ // cap traversal to prevent that.
+ done++
+ if done >= linux.ROBUST_LIST_LIMIT {
+ break
+ }
+ }
+
+ // Is there a pending entry to wake?
+ if pendingLockAddr != 0 {
+ t.wakeRobustListOne(pendingLockAddr)
+ }
+}
+
+// wakeRobustListOne wakes a single futex from the robust list.
+func (t *Task) wakeRobustListOne(addr usermem.Addr) {
+ // Bit 0 in address signals PI futex.
+ pi := addr&1 == 1
+ addr = addr &^ 1
+
+ // Load the futex.
+ f, err := t.LoadUint32(addr)
+ if err != nil {
+ // Can't read this single value? Ignore the problem.
+ // We can wake the other futexes in the list.
+ return
+ }
+
+ tid := uint32(t.ThreadID())
+ for {
+ // Is this held by someone else?
+ if f&linux.FUTEX_TID_MASK != tid {
+ return
+ }
+
+ // This thread is dying and it's holding this futex. We need to
+ // set the owner died bit and wake up any waiters.
+ newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED
+ if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil {
+ return
+ } else if curF != f {
+ // Futex changed out from under us. Try again...
+ f = curF
+ continue
+ }
+
+ // Wake waiters if there are any.
+ if f&linux.FUTEX_WAITERS != 0 {
+ private := f&linux.FUTEX_PRIVATE_FLAG != 0
+ if pi {
+ t.Futex().UnlockPI(t, addr, tid, private)
+ return
+ }
+ t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1)
+ }
+
+ // Done.
+ return
+ }
+}
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index eeccaa197..d23cea802 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -27,6 +27,9 @@ const (
// maxStackDebugBytes is the maximum number of user stack bytes that may be
// printed by debugDumpStack.
maxStackDebugBytes = 1024
+ // maxCodeDebugBytes is the maximum number of user code bytes that may be
+ // printed by debugDumpCode.
+ maxCodeDebugBytes = 128
)
// Infof logs an formatted info message by calling log.Infof.
@@ -61,6 +64,7 @@ func (t *Task) IsLogging(level log.Level) bool {
func (t *Task) DebugDumpState() {
t.debugDumpRegisters()
t.debugDumpStack()
+ t.debugDumpCode()
if mm := t.MemoryManager(); mm != nil {
t.Debugf("Mappings:\n%s", mm)
}
@@ -128,6 +132,45 @@ func (t *Task) debugDumpStack() {
}
}
+// debugDumpCode logs user code contents at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) debugDumpCode() {
+ if !t.IsLogging(log.Debug) {
+ return
+ }
+ m := t.MemoryManager()
+ if m == nil {
+ t.Debugf("Memory manager for task is gone, skipping application code dump.")
+ return
+ }
+ t.Debugf("Code:")
+ // Print code on both sides of the instruction register.
+ start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2
+ // Round addr down to a 16-byte boundary.
+ start &= ^usermem.Addr(15)
+ // Print 16 bytes per line, one byte at a time.
+ for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 {
+ addr, ok := start.AddLength(offset)
+ if !ok {
+ break
+ }
+ var data [16]byte
+ n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{
+ IgnorePermissions: true,
+ })
+ // Print as much of the line as we can, even if an error was
+ // encountered.
+ if n > 0 {
+ t.Debugf("%x: % x", addr, data[:n])
+ }
+ if err != nil {
+ t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err)
+ break
+ }
+ }
+}
+
// trace definitions.
//
// Note that all region names are prefixed by ':' in order to ensure that they
@@ -203,6 +246,6 @@ func (t *Task) traceExecEvent(tc *TaskContext) {
trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
return
}
- defer file.DecRef()
+ defer file.DecRef(t)
trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t))
}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index d654dd997..8dc3fec90 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -26,6 +26,7 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -140,7 +141,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error {
region := trace.StartRegion(t.traceContext, cpuidRegion)
expected := arch.CPUIDInstruction[:]
found := make([]byte, len(expected))
- _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+ _, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found)
if err == nil && bytes.Equal(expected, found) {
// Skip the cpuid instruction.
t.Arch().CPUIDEmulate(t)
@@ -167,15 +168,30 @@ func (app *runApp) execute(t *Task) taskRunState {
return (*runInterrupt)(nil)
}
- // We're about to switch to the application again. If there's still a
+ // Execute any task work callbacks before returning to user space.
+ if atomic.LoadInt32(&t.taskWorkCount) > 0 {
+ t.taskWorkMu.Lock()
+ queue := t.taskWork
+ t.taskWork = nil
+ atomic.StoreInt32(&t.taskWorkCount, 0)
+ t.taskWorkMu.Unlock()
+
+ // Do not hold taskWorkMu while executing task work, which may register
+ // more work.
+ for _, work := range queue {
+ work.TaskWork(t)
+ }
+ }
+
+ // We're about to switch to the application again. If there's still an
// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
// restart the syscall that was interrupted. If there's a saved signal
// mask, restore it. (Note that restoring the saved signal mask may unblock
// a pending signal, causing another interruption, but that signal should
// not interact with the interrupted syscall.)
if t.haveSyscallReturn {
- if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
- if sre == ERESTART_RESTARTBLOCK {
+ if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ if sre == syserror.ERESTART_RESTARTBLOCK {
t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
t.Arch().RestartSyscallWithRestartBlock()
} else {
@@ -245,7 +261,7 @@ func (app *runApp) execute(t *Task) taskRunState {
region := trace.StartRegion(t.traceContext, runRegion)
t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
- info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
+ info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU)
t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
region.End()
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 09366b60c..52c55d13d 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -133,9 +133,10 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
}
}
-// Preconditions: The caller must be running on the task goroutine, and leaving
-// a state indicated by a previous call to
-// t.accountTaskGoroutineEnter(state).
+// Preconditions:
+// * The caller must be running on the task goroutine
+// * The caller must be leaving a state indicated by a previous call to
+// t.accountTaskGoroutineEnter(state).
func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
if state != TaskGoroutineRunningApp {
// Task is unblocking/continuing.
@@ -191,8 +192,8 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats {
return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
}
-// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex
-// must be locked.
+// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
+// * The TaskSet mutex must be locked.
func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
stats := tg.exitedCPUStats
// Account for live tasks.
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 79766cafe..ebdb83061 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -159,7 +159,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
sigact := computeAction(linux.Signal(info.Signo), act)
if t.haveSyscallReturn {
- if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
// Signals that are ignored, cause a thread group stop, or
// terminate the thread group do not interact with interrupted
// syscalls; in Linux terms, they are never returned to the signal
@@ -168,11 +168,11 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
// signal that is actually handled (by userspace).
if sigact == SignalActionHandler {
switch {
- case sre == ERESTARTNOHAND:
+ case sre == syserror.ERESTARTNOHAND:
fallthrough
- case sre == ERESTART_RESTARTBLOCK:
+ case sre == syserror.ERESTART_RESTARTBLOCK:
fallthrough
- case (sre == ERESTARTSYS && !act.IsRestart()):
+ case (sre == syserror.ERESTARTSYS && !act.IsRestart()):
t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1)))
default:
@@ -255,10 +255,15 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
}
}
+ mm := t.MemoryManager()
// Set up the signal handler. If we have a saved signal mask, the signal
// handler should run with the current mask, but sigreturn should restore
// the saved one.
- st := &arch.Stack{t.Arch(), t.MemoryManager(), sp}
+ st := &arch.Stack{
+ Arch: t.Arch(),
+ IO: mm,
+ Bottom: sp,
+ }
mask := t.signalMask
if t.haveSavedSignalMask {
mask = t.savedSignalMask
@@ -273,12 +278,13 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
// Please see the linux code as reference:
// linux/arch/arm64/kernel/signal.c:setup_return()
if act.Flags&linux.SA_RESTORER == 0 {
- act.Restorer = t.MemoryManager().VDSOSigReturn()
+ act.Restorer = mm.VDSOSigReturn()
}
if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
return err
}
+ t.p.FullStateChanged()
t.haveSavedSignalMask = false
// Add our signal mask.
@@ -310,14 +316,16 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
// Restore our signal mask. SIGKILL and SIGSTOP should not be blocked.
t.SetSignalMask(sigset &^ UnblockableSignals)
+ t.p.FullStateChanged()
return ctrlResume, nil
}
// Sigtimedwait implements the semantics of sigtimedwait(2).
//
-// Preconditions: The caller must be running on the task goroutine. t.exitState
-// < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
// set is the set of signals we're interested in; invert it to get the set
// of signals to block.
@@ -581,8 +589,9 @@ func (t *Task) SignalMask() linux.SignalSet {
// SetSignalMask sets t's signal mask.
//
-// Preconditions: SetSignalMask can only be called by the task goroutine.
-// t.exitState < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
func (t *Task) SetSignalMask(mask linux.SignalSet) {
// By precondition, t prevents t.tg from completing an execve and mutating
// t.tg.signalHandlers, so we can skip the TaskSet mutex.
@@ -628,7 +637,7 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's
// comment).
//
-// Preconditions: SetSavedSignalMask can only be called by the task goroutine.
+// Preconditions: The caller must be running on the task goroutine.
func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
t.savedSignalMask = mask
t.haveSavedSignalMask = true
@@ -636,6 +645,7 @@ func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
// SignalStack returns the task-private signal stack.
func (t *Task) SignalStack() arch.SignalStack {
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
alt := t.signalStack
if t.onSignalStack(alt) {
alt.Flags |= arch.SignalStackFlagOnStack
@@ -1050,6 +1060,8 @@ func (*runInterrupt) execute(t *Task) taskRunState {
// Are there signals pending?
if info := t.dequeueSignalLocked(t.signalMask); info != nil {
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 {
// Indicate that we've dequeued a stop signal before unlocking the
// signal mutex; initiateGroupStop will check for races with
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 8485fb4b6..64c1e120a 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -102,10 +102,10 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
t, err := ts.newTask(cfg)
if err != nil {
cfg.TaskContext.release()
- cfg.FSContext.DecRef()
- cfg.FDTable.DecRef()
+ cfg.FSContext.DecRef(t)
+ cfg.FDTable.DecRef(t)
if cfg.MountNamespaceVFS2 != nil {
- cfg.MountNamespaceVFS2.DecRef()
+ cfg.MountNamespaceVFS2.DecRef(t)
}
return nil, err
}
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
index 10c6e455c..a35948a5f 100644
--- a/pkg/sentry/kernel/task_stop.go
+++ b/pkg/sentry/kernel/task_stop.go
@@ -99,8 +99,9 @@ type TaskStop interface {
// beginInternalStop indicates the start of an internal stop that applies to t.
//
-// Preconditions: The task must not already be in an internal stop (i.e. t.stop
-// == nil). The caller must be running on the task goroutine.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * The task must not already be in an internal stop (i.e. t.stop == nil).
func (t *Task) beginInternalStop(s TaskStop) {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
@@ -109,8 +110,8 @@ func (t *Task) beginInternalStop(s TaskStop) {
t.beginInternalStopLocked(s)
}
-// Preconditions: The signal mutex must be locked. All preconditions for
-// Task.beginInternalStop also apply.
+// Preconditions: Same as beginInternalStop, plus:
+// * The signal mutex must be locked.
func (t *Task) beginInternalStopLocked(s TaskStop) {
if t.stop != nil {
panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop))
@@ -128,8 +129,9 @@ func (t *Task) beginInternalStopLocked(s TaskStop) {
// t.stop, which is why there is no endInternalStop that locks the signal mutex
// for you.
//
-// Preconditions: The signal mutex must be locked. The task must be in an
-// internal stop (i.e. t.stop != nil).
+// Preconditions:
+// * The signal mutex must be locked.
+// * The task must be in an internal stop (i.e. t.stop != nil).
func (t *Task) endInternalStopLocked() {
if t.stop == nil {
panic("Attempting to leave non-existent internal stop")
@@ -205,6 +207,22 @@ func (ts *TaskSet) BeginExternalStop() {
}
}
+// PullFullState receives full states for all tasks.
+func (ts *TaskSet) PullFullState() {
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ if ts.Root == nil {
+ return
+ }
+ for t := range ts.Root.tids {
+ t.Activate()
+ if mm := t.MemoryManager(); mm != nil {
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+ }
+ t.Deactivate()
+ }
+}
+
// EndExternalStop indicates the end of an external stop started by a previous
// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task
// goroutines to resume.
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index a5903b0b5..0141459e7 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -29,75 +30,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel
-// include/linux/errno.h. These errnos are never returned to userspace
-// directly, but are used to communicate the expected behavior of an
-// interrupted syscall from the syscall to signal handling.
-type SyscallRestartErrno int
-
-// These numeric values are significant because ptrace syscall exit tracing can
-// observe them.
-//
-// For all of the following errnos, if the syscall is not interrupted by a
-// signal delivered to a user handler, the syscall is restarted.
-const (
- // ERESTARTSYS is returned by an interrupted syscall to indicate that it
- // should be converted to EINTR if interrupted by a signal delivered to a
- // user handler without SA_RESTART set, and restarted otherwise.
- ERESTARTSYS = SyscallRestartErrno(512)
-
- // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it
- // should always be restarted.
- ERESTARTNOINTR = SyscallRestartErrno(513)
-
- // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it
- // should be converted to EINTR if interrupted by a signal delivered to a
- // user handler, and restarted otherwise.
- ERESTARTNOHAND = SyscallRestartErrno(514)
-
- // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate
- // that it should be restarted using a custom function. The interrupted
- // syscall must register a custom restart function by calling
- // Task.SetRestartSyscallFn.
- ERESTART_RESTARTBLOCK = SyscallRestartErrno(516)
-)
-
var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application")
-// Error implements error.Error.
-func (e SyscallRestartErrno) Error() string {
- // Descriptions are borrowed from strace.
- switch e {
- case ERESTARTSYS:
- return "to be restarted if SA_RESTART is set"
- case ERESTARTNOINTR:
- return "to be restarted"
- case ERESTARTNOHAND:
- return "to be restarted if no handler"
- case ERESTART_RESTARTBLOCK:
- return "interrupted by signal"
- default:
- return "(unknown interrupt error)"
- }
-}
-
-// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
-// rv, the value in a syscall return register.
-func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
- switch int(rv) {
- case -int(ERESTARTSYS):
- return ERESTARTSYS, true
- case -int(ERESTARTNOINTR):
- return ERESTARTNOINTR, true
- case -int(ERESTARTNOHAND):
- return ERESTARTNOHAND, true
- case -int(ERESTART_RESTARTBLOCK):
- return ERESTART_RESTARTBLOCK, true
- default:
- return 0, false
- }
-}
-
// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
@@ -354,7 +288,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
// Grab the caller up front, to make sure there's a sensible stack.
caller := t.Arch().Native(uintptr(0))
- if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil {
+ if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil {
t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
@@ -390,7 +324,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
type runVsyscallAfterPtraceEventSeccomp struct {
addr usermem.Addr
sysno uintptr
- caller interface{}
+ caller marshal.Marshallable
}
func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
@@ -413,7 +347,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}
-func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
+func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
rval, ctrl, err := t.executeSyscall(sysno, args)
if ctrl != nil {
t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
@@ -447,7 +381,7 @@ func ExtractErrno(err error, sysno int) int {
return 0
case syscall.Errno:
return int(err)
- case SyscallRestartErrno:
+ case syserror.SyscallRestartErrno:
return int(err)
case *memmap.BusError:
// Bus errors may generate SIGBUS, but for syscalls they still
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index b02044ad2..ce134bf54 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,6 +18,7 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -43,17 +44,6 @@ func (t *Task) Deactivate() {
}
}
-// CopyIn copies a fixed-size value or slice of fixed-size values in from the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not readable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) {
- return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-}
-
// CopyInBytes is a fast version of CopyIn if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -64,17 +54,6 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
})
}
-// CopyOut copies a fixed-size value or slice of fixed-size values out to the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not writeable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) {
- return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-}
-
// CopyOutBytes is a fast version of CopyOut if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -114,7 +93,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
var v []string
for {
argAddr := t.Arch().Native(0)
- if _, err := t.CopyIn(addr, argAddr); err != nil {
+ if _, err := argAddr.CopyIn(t, addr); err != nil {
return v, err
}
if t.Arch().Value(argAddr) == 0 {
@@ -143,8 +122,9 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
// CopyOutIovecs converts src to an array of struct iovecs and copies it to the
// memory mapped at addr.
//
-// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyOut, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
switch t.Arch().Width() {
case 8:
@@ -191,8 +171,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
// combined length of all AddrRanges would otherwise exceed this amount, ranges
// beyond MAX_RW_COUNT are silently truncated.
//
-// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyIn, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
if numIovecs == 0 {
return usermem.AddrRangeSeq{}, nil
@@ -284,7 +265,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp
//
// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
//
-// Preconditions: As for Task.CopyInIovecs.
+// Preconditions: Same as Task.CopyInIovecs.
func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
return usermem.IOSequence{}, syserror.EINVAL
@@ -299,3 +280,30 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
Opts: opts,
}, nil
}
+
+// copyContext implements marshal.CopyContext. It wraps a task to allow copying
+// memory to and from the task memory with custom usermem.IOOpts.
+type copyContext struct {
+ *Task
+ opts usermem.IOOpts
+}
+
+// AsCopyContext wraps the task and returns it as CopyContext.
+func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
+ return &copyContext{t, opts}
+}
+
+// CopyInString copies a string in from the task's memory.
+func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
+ return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+}
+
+// CopyInBytes copies task memory into dst from an IO context.
+func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+ return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+}
+
+// CopyOutBytes copies src into task memoryfrom an IO context.
+func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+ return t.MemoryManager().CopyOut(t, addr, src, t.opts)
+}
diff --git a/pkg/sentry/kernel/task_work.go b/pkg/sentry/kernel/task_work.go
new file mode 100644
index 000000000..dda5a433a
--- /dev/null
+++ b/pkg/sentry/kernel/task_work.go
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "sync/atomic"
+
+// TaskWorker is a deferred task.
+//
+// This must be savable.
+type TaskWorker interface {
+ // TaskWork will be executed prior to returning to user space. Note that
+ // TaskWork may call RegisterWork again, but this will not be executed until
+ // the next return to user space, unlike in Linux. This effectively allows
+ // registration of indefinite user return hooks, but not by default.
+ TaskWork(t *Task)
+}
+
+// RegisterWork can be used to register additional task work that will be
+// performed prior to returning to user space. See TaskWorker.TaskWork for
+// semantics regarding registration.
+func (t *Task) RegisterWork(work TaskWorker) {
+ t.taskWorkMu.Lock()
+ defer t.taskWorkMu.Unlock()
+ atomic.AddInt32(&t.taskWorkCount, 1)
+ t.taskWork = append(t.taskWork, work)
+}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 4dfd2c990..0b34c0099 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -308,7 +308,7 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet {
}
// release releases the thread group's resources.
-func (tg *ThreadGroup) release() {
+func (tg *ThreadGroup) release(t *Task) {
// Timers must be destroyed without holding the TaskSet or signal mutexes
// since timers send signals with Timer.mu locked.
tg.itimerRealTimer.Destroy()
@@ -325,7 +325,7 @@ func (tg *ThreadGroup) release() {
it.DestroyTimer()
}
if tg.mounts != nil {
- tg.mounts.DecRef()
+ tg.mounts.DecRef(t)
}
}
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index 872e1a82d..5ae5906e8 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -36,6 +36,8 @@ import (
const TasksLimit = (1 << 16)
// ThreadID is a generic thread identifier.
+//
+// +marshal
type ThreadID int32
// String returns a decimal representation of the ThreadID.
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 7ba7dc50c..2817aa3ba 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -6,6 +6,7 @@ go_library(
name = "time",
srcs = [
"context.go",
+ "tcpip.go",
"time.go",
],
visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/kernel/time/tcpip.go b/pkg/sentry/kernel/time/tcpip.go
new file mode 100644
index 000000000..c4474c0cf
--- /dev/null
+++ b/pkg/sentry/kernel/time/tcpip.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package time
+
+import (
+ "sync"
+ "time"
+)
+
+// TcpipAfterFunc waits for duration to elapse according to clock then runs fn.
+// The timer is started immediately and will fire exactly once.
+func TcpipAfterFunc(clock Clock, duration time.Duration, fn func()) *TcpipTimer {
+ timer := &TcpipTimer{
+ clock: clock,
+ }
+ timer.notifier = functionNotifier{
+ fn: func() {
+ // tcpip.Timer.Stop() explicitly states that the function is called in a
+ // separate goroutine that Stop() does not synchronize with.
+ // Timer.Destroy() synchronizes with calls to TimerListener.Notify().
+ // This is semantically meaningful because, in the former case, it's
+ // legal to call tcpip.Timer.Stop() while holding locks that may also be
+ // taken by the function, but this isn't so in the latter case. Most
+ // immediately, Timer calls TimerListener.Notify() while holding
+ // Timer.mu. A deadlock occurs without spawning a goroutine:
+ // T1: (Timer expires)
+ // => Timer.Tick() <- Timer.mu.Lock() called
+ // => TimerListener.Notify()
+ // => Timer.Stop()
+ // => Timer.Destroy() <- Timer.mu.Lock() called, deadlock!
+ //
+ // Spawning a goroutine avoids the deadlock:
+ // T1: (Timer expires)
+ // => Timer.Tick() <- Timer.mu.Lock() called
+ // => TimerListener.Notify() <- Launches T2
+ // T2:
+ // => Timer.Stop()
+ // => Timer.Destroy() <- Timer.mu.Lock() called, blocks
+ // T1:
+ // => (returns) <- Timer.mu.Unlock() called
+ // T2:
+ // => (continues) <- No deadlock!
+ go func() {
+ timer.Stop()
+ fn()
+ }()
+ },
+ }
+ timer.Reset(duration)
+ return timer
+}
+
+// TcpipTimer is a resettable timer with variable duration expirations.
+// Implements tcpip.Timer, which does not define a Destroy method; instead, all
+// resources are released after timer expiration and calls to Timer.Stop.
+//
+// Must be created by AfterFunc.
+type TcpipTimer struct {
+ // clock is the time source. clock is immutable.
+ clock Clock
+
+ // notifier is called when the Timer expires. notifier is immutable.
+ notifier functionNotifier
+
+ // mu protects t.
+ mu sync.Mutex
+
+ // t stores the latest running Timer. This is replaced whenever Reset is
+ // called since Timer cannot be restarted once it has been Destroyed by Stop.
+ //
+ // This field is nil iff Stop has been called.
+ t *Timer
+}
+
+// Stop implements tcpip.Timer.Stop.
+func (r *TcpipTimer) Stop() bool {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if r.t == nil {
+ return false
+ }
+ _, lastSetting := r.t.Swap(Setting{})
+ r.t.Destroy()
+ r.t = nil
+ return lastSetting.Enabled
+}
+
+// Reset implements tcpip.Timer.Reset.
+func (r *TcpipTimer) Reset(d time.Duration) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if r.t == nil {
+ r.t = NewTimer(r.clock, &r.notifier)
+ }
+
+ r.t.Swap(Setting{
+ Enabled: true,
+ Period: 0,
+ Next: r.clock.Now().Add(d),
+ })
+}
+
+// functionNotifier is a TimerListener that runs a function.
+//
+// functionNotifier cannot be saved or loaded.
+type functionNotifier struct {
+ fn func()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (f *functionNotifier) Notify(uint64, Setting) (Setting, bool) {
+ f.fn()
+ return Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (f *functionNotifier) Destroy() {}
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index e959700f2..f61a8e164 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -616,8 +616,10 @@ func (t *Timer) Swap(s Setting) (Time, Setting) {
// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true
// starts the timer, while setting s.Enabled to false stops it.
//
-// Preconditions: The Timer must not be paused. f cannot call any Timer methods
-// since it is called with the Timer mutex locked.
+// Preconditions:
+// * The Timer must not be paused.
+// * f cannot call any Timer methods since it is called with the Timer mutex
+// locked.
func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
now := t.clock.Now()
t.mu.Lock()
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index 0adf25691..7c4fefb16 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -21,8 +21,8 @@ import (
"gvisor.dev/gvisor/pkg/log"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -90,7 +90,7 @@ type Timekeeper struct {
// NewTimekeeper does not take ownership of paramPage.
//
// SetClocks must be called on the returned Timekeeper before it is usable.
-func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) {
+func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) {
return &Timekeeper{
params: NewVDSOParamPage(mfp, paramPage),
}, nil
@@ -210,9 +210,6 @@ func (t *Timekeeper) startUpdater() {
p.realtimeBaseRef = int64(realtimeParams.BaseRef)
p.realtimeFrequency = realtimeParams.Frequency
}
-
- log.Debugf("Updating VDSO parameters: %+v", p)
-
return p
}); err != nil {
log.Warningf("Unable to update VDSO parameter page: %v", err)
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index f1b3c212c..9bc452e67 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -17,10 +17,9 @@ package kernel
import (
"fmt"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -28,6 +27,8 @@ import (
//
// They are exposed to the VDSO via a parameter page managed by VDSOParamPage,
// which also includes a sequence counter.
+//
+// +marshal
type vdsoParams struct {
monotonicReady uint64
monotonicBaseCycles int64
@@ -58,7 +59,7 @@ type vdsoParams struct {
type VDSOParamPage struct {
// The parameter page is fr, allocated from mfp.MemoryFile().
mfp pgalloc.MemoryFileProvider
- fr platform.FileRange
+ fr memmap.FileRange
// seq is the current sequence count written to the page.
//
@@ -68,21 +69,29 @@ type VDSOParamPage struct {
// checked in state_test_util tests, causing this field to change across
// save / restore.
seq uint64
+
+ // copyScratchBuffer is a temporary buffer used to marshal the params before
+ // copying it to the real parameter page. The parameter page is typically
+ // updated at a moderate frequency of ~O(seconds) throughout the lifetime of
+ // the sentry, so reusing this buffer is a good tradeoff between memory
+ // usage and the cost of allocation.
+ copyScratchBuffer []byte
}
// NewVDSOParamPage returns a VDSOParamPage.
//
// Preconditions:
-//
// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does
// not take ownership of fr; it must remain allocated for the lifetime of the
// VDSOParamPage.
-//
// * VDSOParamPage must be the only writer to fr.
-//
// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
-func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage {
- return &VDSOParamPage{mfp: mfp, fr: fr}
+func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
+ return &VDSOParamPage{
+ mfp: mfp,
+ fr: fr,
+ copyScratchBuffer: make([]byte, (*vdsoParams)(nil).SizeBytes()),
+ }
}
// access returns a mapping of the param page.
@@ -136,7 +145,8 @@ func (v *VDSOParamPage) Write(f func() vdsoParams) error {
// Get the new params.
p := f()
- buf := binary.Marshal(nil, usermem.ByteOrder, p)
+ buf := v.copyScratchBuffer[:p.SizeBytes()]
+ p.MarshalUnsafe(buf)
// Skip the sequence counter.
if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil {