Diffstat (limited to 'pkg/sentry/kernel')
68 files changed, 2233 insertions, 878 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index a28eab8b8..90dd4a047 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -69,8 +69,63 @@ go_template_instance( prefix = "socket", template = "//pkg/ilist:generic_list", types = { - "Element": "*SocketEntry", - "Linker": "*SocketEntry", + "Element": "*SocketRecordVFS1", + "Linker": "*SocketRecordVFS1", + }, +) + +go_template_instance( + name = "fd_table_refs", + out = "fd_table_refs.go", + package = "kernel", + prefix = "FDTable", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "FDTable", + }, +) + +go_template_instance( + name = "fs_context_refs", + out = "fs_context_refs.go", + package = "kernel", + prefix = "FSContext", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "FSContext", + }, +) + +go_template_instance( + name = "ipc_namespace_refs", + out = "ipc_namespace_refs.go", + package = "kernel", + prefix = "IPCNamespace", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "IPCNamespace", + }, +) + +go_template_instance( + name = "process_group_refs", + out = "process_group_refs.go", + package = "kernel", + prefix = "ProcessGroup", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "ProcessGroup", + }, +) + +go_template_instance( + name = "session_refs", + out = "session_refs.go", + package = "kernel", + prefix = "Session", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "Session", }, ) @@ -85,11 +140,17 @@ go_library( name = "kernel", srcs = [ "abstract_socket_namespace.go", + "aio.go", "context.go", "fd_table.go", + "fd_table_refs.go", "fd_table_unsafe.go", "fs_context.go", + "fs_context_refs.go", "ipc_namespace.go", + "ipc_namespace_refs.go", + "kcov.go", + "kcov_unsafe.go", "kernel.go", "kernel_opts.go", "kernel_state.go", @@ -98,6 +159,7 @@ go_library( "pending_signals_state.go", "posixtimer.go", "process_group_list.go", + "process_group_refs.go", "ptrace.go", "ptrace_amd64.go", "ptrace_arm64.go", @@ -105,6 +167,7 @@ go_library( "seccomp.go", "seqatomic_taskgoroutineschedinfo_unsafe.go", "session_list.go", + "session_refs.go", "sessions.go", "signal.go", "signal_handlers.go", @@ -131,6 +194,7 @@ go_library( "task_stop.go", "task_syscall.go", "task_usermem.go", + "task_work.go", "thread_group.go", "threads.go", "timekeeper.go", @@ -145,22 +209,27 @@ go_library( "gvisor.dev/gvisor/pkg/sentry/device", "gvisor.dev/gvisor/pkg/tcpip", ], + marshal = True, visibility = ["//:sandbox"], deps = [ ":uncaught_signal_go_proto", "//pkg/abi", "//pkg/abi/linux", "//pkg/amutex", - "//pkg/binary", "//pkg/bits", "//pkg/bpf", + "//pkg/cleanup", "//pkg/context", + "//pkg/coverage", "//pkg/cpuid", "//pkg/eventchannel", "//pkg/fspath", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/metric", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safemem", "//pkg/secio", "//pkg/sentry/arch", @@ -199,6 +268,7 @@ go_library( "//pkg/sentry/vfs", "//pkg/state", "//pkg/state/statefile", + "//pkg/state/wire", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", @@ -206,7 +276,6 @@ go_library( "//pkg/tcpip/stack", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 920fe4329..0ddbe5ff6 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -15,28 +15,21 @@ package kernel import ( + "fmt" "syscall" - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/context" + 
"gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" ) // +stateify savable type abstractEndpoint struct { - ep transport.BoundEndpoint - wr *refs.WeakRef - name string - ns *AbstractSocketNamespace -} - -// WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (e *abstractEndpoint) WeakRefGone() { - e.ns.mu.Lock() - if e.ns.endpoints[e.name].ep == e.ep { - delete(e.ns.endpoints, e.name) - } - e.ns.mu.Unlock() + ep transport.BoundEndpoint + socket refsvfs2.RefCounter + name string + ns *AbstractSocketNamespace } // AbstractSocketNamespace is used to implement the Linux abstract socket functionality. @@ -45,7 +38,11 @@ func (e *abstractEndpoint) WeakRefGone() { type AbstractSocketNamespace struct { mu sync.Mutex `state:"nosave"` - // Keeps mapping from name to endpoint. + // Keeps a mapping from name to endpoint. AbstractSocketNamespace does not hold + // any references on any sockets that it contains; when retrieving a socket, + // TryIncRef() must be called in case the socket is concurrently being + // destroyed. It is the responsibility of the socket to remove itself from the + // abstract socket namespace when it is destroyed. endpoints map[string]abstractEndpoint } @@ -57,16 +54,16 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace { } // A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on -// its backing object. +// its backing socket. type boundEndpoint struct { transport.BoundEndpoint - rc refs.RefCounter + socket refsvfs2.RefCounter } // Release implements transport.BoundEndpoint.Release. -func (e *boundEndpoint) Release() { - e.rc.DecRef() - e.BoundEndpoint.Release() +func (e *boundEndpoint) Release(ctx context.Context) { + e.socket.DecRef(ctx) + e.BoundEndpoint.Release(ctx) } // BoundEndpoint retrieves the endpoint bound to the given name. The return @@ -80,32 +77,59 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp return nil } - rc := ep.wr.Get() - if rc == nil { - delete(a.endpoints, name) + if !ep.socket.TryIncRef() { + // The socket has reached zero references and is being destroyed. return nil } - return &boundEndpoint{ep.ep, rc} + return &boundEndpoint{ep.ep, ep.socket} } // Bind binds the given socket. // -// When the last reference managed by rc is dropped, ep may be removed from the +// When the last reference managed by socket is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error { a.mu.Lock() defer a.mu.Unlock() + // Check if there is already a socket (which has not yet been destroyed) bound at name. if ep, ok := a.endpoints[name]; ok { - if rc := ep.wr.Get(); rc != nil { - rc.DecRef() + if ep.socket.TryIncRef() { + ep.socket.DecRef(ctx) return syscall.EADDRINUSE } } ae := abstractEndpoint{ep: ep, name: name, ns: a} - ae.wr = refs.NewWeakRef(rc, &ae) + ae.socket = socket a.endpoints[name] = ae return nil } + +// Remove removes the specified socket at name from the abstract socket +// namespace, if it has not yet been replaced. 
+func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + // We never delete a map entry apart from a socket's destructor (although the + // map entry may be overwritten). Therefore, a socket should exist, even if it + // may not be the one we expect. + panic(fmt.Sprintf("expected socket to exist at '%s' in abstract socket namespace", name)) + } + + // A Bind() operation may race with callers of Remove(), e.g. in the + // following case: + // socket1 reaches zero references and begins destruction + // a.Bind("foo", ep, socket2) replaces socket1 with socket2 + // socket1's destructor calls a.Remove("foo", socket1) + // + // Therefore, we need to check that the socket at name is what we expect + // before modifying the map. + if ep.socket == socket { + delete(a.endpoints, name) + } +} diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go new file mode 100644 index 000000000..0ac78c0b8 --- /dev/null +++ b/pkg/sentry/kernel/aio.go @@ -0,0 +1,81 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" +) + +// AIOCallback is a function that does asynchronous I/O on behalf of a task. +type AIOCallback func(context.Context) + +// QueueAIO queues an AIOCallback which will be run asynchronously. +func (t *Task) QueueAIO(cb AIOCallback) { + ctx := taskAsyncContext{t: t} + wg := &t.TaskSet().aioGoroutines + wg.Add(1) + go func() { + cb(ctx) + wg.Done() + }() +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) +} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return ctx.t.Deadline() +} + +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return ctx.t.Done() +} + +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return ctx.t.Err() +} + +// Value implements context.Context.Value. 
+func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 2bc49483a..869e49ebc 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -57,6 +57,7 @@ go_library( "id_map_set.go", "user_namespace.go", ], + marshal = True, visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go index ef5723127..c08d47787 100644 --- a/pkg/sentry/kernel/auth/context.go +++ b/pkg/sentry/kernel/auth/context.go @@ -34,3 +34,23 @@ func CredentialsFromContext(ctx context.Context) *Credentials { } return NewAnonymousCredentials() } + +// ContextWithCredentials returns a copy of ctx carrying creds. +func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context { + return &authContext{ctx, creds} +} + +type authContext struct { + context.Context + creds *Credentials +} + +// Value implements context.Context. +func (ac *authContext) Value(key interface{}) interface{} { + switch key { + case CtxCredentials: + return ac.creds + default: + return ac.Context.Value(key) + } +} diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index e057d2c6d..6862f2ef5 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -232,3 +232,31 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) { } return NoID, syserror.EPERM } + +// SetUID translates the provided uid to the root user namespace and updates c's +// uids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetUID(uid UID) error { + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + c.RealKUID = kuid + c.EffectiveKUID = kuid + c.SavedKUID = kuid + return nil +} + +// SetGID translates the provided gid to the root user namespace and updates c's +// gids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetGID(gid GID) error { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + c.RealKGID = kgid + c.EffectiveKGID = kgid + c.SavedKGID = kgid + return nil +} diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go index 0a58ba17c..4c32ee703 100644 --- a/pkg/sentry/kernel/auth/id.go +++ b/pkg/sentry/kernel/auth/id.go @@ -19,9 +19,13 @@ import ( ) // UID is a user ID in an unspecified user namespace. +// +// +marshal type UID uint32 // GID is a group ID in an unspecified user namespace. +// +// +marshal slice:GIDSlice type GID uint32 // In the root user namespace, user/group IDs have a 1-to-1 relationship with diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index 0c40bf315..bb94769c4 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -18,7 +18,6 @@ import ( "time" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" ) // contextID is the kernel package's type for context.Context.Value keys. @@ -82,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { } // IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, -// or nil if there is no such IPC namespace. +// or nil if there is no such IPC namespace. 
It takes a reference on the +// namespace. func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { if v := ctx.Value(CtxIPCNamespace); v != nil { return v.(*IPCNamespace) @@ -113,55 +113,3 @@ func (*Task) Done() <-chan struct{} { func (*Task) Err() error { return nil } - -// AsyncContext returns a context.Context that may be used by goroutines that -// do work on behalf of t and therefore share its contextual values, but are -// not t's task goroutine (e.g. asynchronous I/O). -func (t *Task) AsyncContext() context.Context { - return taskAsyncContext{t: t} -} - -type taskAsyncContext struct { - context.NoopSleeper - t *Task -} - -// Debugf implements log.Logger.Debugf. -func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { - ctx.t.Debugf(format, v...) -} - -// Infof implements log.Logger.Infof. -func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { - ctx.t.Infof(format, v...) -} - -// Warningf implements log.Logger.Warningf. -func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { - ctx.t.Warningf(format, v...) -} - -// IsLogging implements log.Logger.IsLogging. -func (ctx taskAsyncContext) IsLogging(level log.Level) bool { - return ctx.t.IsLogging(level) -} - -// Deadline implements context.Context.Deadline. -func (ctx taskAsyncContext) Deadline() (time.Time, bool) { - return ctx.t.Deadline() -} - -// Done implements context.Context.Done. -func (ctx taskAsyncContext) Done() <-chan struct{} { - return ctx.t.Done() -} - -// Err implements context.Context.Err. -func (ctx taskAsyncContext) Err() error { - return ctx.t.Err() -} - -// Value implements context.Context.Value. -func (ctx taskAsyncContext) Value(key interface{}) interface{} { - return ctx.t.Value(key) -} diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 3d78cd48f..15519f0df 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -76,8 +76,8 @@ type pollEntry struct { // WeakRefGone implements refs.WeakRefUser.WeakRefGone. // weakReferenceGone is called when the file in the weak reference is destroyed. // The poll entry is removed in response to this. -func (p *pollEntry) WeakRefGone() { - p.epoll.RemoveEntry(p.id) +func (p *pollEntry) WeakRefGone(ctx context.Context) { + p.epoll.RemoveEntry(ctx, p.id) } // EventPoll holds all the state associated with an event poll object, that is, @@ -107,7 +107,7 @@ type EventPoll struct { // different lock to avoid circular lock acquisition order involving // the wait queue mutexes and mu. The full order is mu, observed file // wait queue mutex, then listsMu; this allows listsMu to be acquired - // when readyCallback is called. + // when (*pollEntry).Callback is called. // // An entry is always in one of the following lists: // readyList -- when there's a chance that it's ready to have @@ -116,7 +116,7 @@ type EventPoll struct { // readEvents() functions always call the entry's file // Readiness() function to confirm it's ready. // waitingList -- when there's no chance that the entry is ready, - // so it's waiting for the readyCallback to be called + // so it's waiting for the (*pollEntry).Callback to be called // on it before it gets moved to the readyList. // disabledList -- when the entry is disabled. This happens when // a one-shot entry gets delivered via readEvents(). @@ -144,14 +144,14 @@ func NewEventPoll(ctx context.Context) *fs.File { // name matches fs/eventpoll.c:epoll_create1. 
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ files: make(map[FileIdentifier]*pollEntry), }) } // Release implements fs.FileOperations.Release. -func (e *EventPoll) Release() { +func (e *EventPoll) Release(ctx context.Context) { // We need to take the lock now because files may be attempting to // remove entries in parallel if they get destroyed. e.mu.Lock() @@ -160,7 +160,7 @@ func (e *EventPoll) Release() { // Go through all entries and clean up. for _, entry := range e.files { entry.id.File.EventUnregister(&entry.waiter) - entry.file.Drop() + entry.file.Drop(ctx) } e.files = nil } @@ -269,21 +269,19 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { return ret } -// readyCallback is called when one of the files we're polling becomes ready. It -// moves said file to the readyList if it's currently in the waiting list. -type readyCallback struct{} - // Callback implements waiter.EntryCallback.Callback. -func (*readyCallback) Callback(w *waiter.Entry) { - entry := w.Context.(*pollEntry) - e := entry.epoll +// +// Callback is called when one of the files we're polling becomes ready. It +// moves said file to the readyList if it's currently in the waiting list. +func (p *pollEntry) Callback(*waiter.Entry) { + e := p.epoll e.listsMu.Lock() - if entry.curList == &e.waitingList { - e.waitingList.Remove(entry) - e.readyList.PushBack(entry) - entry.curList = &e.readyList + if p.curList == &e.waitingList { + e.waitingList.Remove(p) + e.readyList.PushBack(p) + p.curList = &e.readyList e.listsMu.Unlock() e.Notify(waiter.EventIn) @@ -310,7 +308,7 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) { // Check if the file happens to already be in a ready state. ready := f.Readiness(entry.mask) & entry.mask if ready != 0 { - (*readyCallback).Callback(nil, &entry.waiter) + entry.Callback(&entry.waiter) } } @@ -380,10 +378,9 @@ func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.Ev userData: data, epoll: e, flags: flags, - waiter: waiter.Entry{Callback: &readyCallback{}}, mask: mask, } - entry.waiter.Context = entry + entry.waiter.Callback = entry e.files[id] = entry entry.file = refs.NewWeakRef(id.File, entry) @@ -406,7 +403,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter } // Unregister the old mask and remove entry from the list it's in, so - // readyCallback is guaranteed to not be called on this entry anymore. + // (*pollEntry).Callback is guaranteed to not be called on this entry anymore. entry.id.File.EventUnregister(&entry.waiter) // Remove entry from whatever list it's in. This ensures that no other @@ -426,7 +423,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter } // RemoveEntry removes a file from the collection of observed files. -func (e *EventPoll) RemoveEntry(id FileIdentifier) error { +func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error { e.mu.Lock() defer e.mu.Unlock() @@ -448,7 +445,7 @@ func (e *EventPoll) RemoveEntry(id FileIdentifier) error { // Remove file from map, and drop weak reference. 
delete(e.files, id) - entry.file.Drop() + entry.file.Drop(ctx) return nil } diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index 8e9f200d0..7c61e0258 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -21,8 +21,7 @@ import ( // afterLoad is invoked by stateify. func (p *pollEntry) afterLoad() { - p.waiter = waiter.Entry{Callback: &readyCallback{}} - p.waiter.Context = p + p.waiter.Callback = p p.file = refs.NewWeakRef(p.id.File, p) p.id.File.EventRegister(&p.waiter, p.mask) } diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index 22630e9c5..55b505593 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -26,7 +26,8 @@ func TestFileDestroyed(t *testing.T) { f := filetest.NewTestFile(t) id := FileIdentifier{f, 12} - efile := NewEventPoll(contexttest.Context(t)) + ctx := contexttest.Context(t) + efile := NewEventPoll(ctx) e := efile.FileOperations.(*EventPoll) if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { t.Fatalf("addEntry failed: %v", err) @@ -44,7 +45,7 @@ func TestFileDestroyed(t *testing.T) { } // Destroy the file. Check that we get no more events. - f.DecRef() + f.DecRef(ctx) evt = e.ReadEvents(1) if len(evt) != 0 { diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 87951adeb..bbf568dfc 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -70,7 +70,7 @@ func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { // name matches fs/eventfd.c:eventfd_file_create. dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]") // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ val: initVal, semMode: semMode, @@ -106,7 +106,7 @@ func (e *EventOperations) HostFD() (int, error) { } // Release implements fs.FileOperations.Release. -func (e *EventOperations) Release() { +func (e *EventOperations) Release(context.Context) { e.mu.Lock() defer e.mu.Unlock() if e.hostfd >= 0 { diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index b9126e946..2b3955598 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -11,6 +11,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index d32c3e90a..153d2cd9b 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -20,15 +20,21 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) -// New creates a new FileAsync. +// New creates a new fs.FileAsync. func New() fs.FileAsync { return &FileAsync{} } +// NewVFS2 creates a new vfs.FileAsync. +func NewVFS2() vfs.FileAsync { + return &FileAsync{} +} + // FileAsync sends signals when the registered file is ready for IO. 
// // +stateify savable @@ -170,3 +176,13 @@ func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kern a.recipientTG = nil a.recipientPG = recipient } + +// ClearOwner unsets the current signal recipient. +func (a *FileAsync) ClearOwner() { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = nil + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = nil +} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index dbfcef0fa..7aba31587 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -23,12 +23,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" ) // FDFlags define flags for an individual descriptor. @@ -77,11 +77,9 @@ type descriptor struct { // // +stateify savable type FDTable struct { - refs.AtomicRefCount - k *Kernel + FDTableRefs - // uid is a unique identifier. - uid uint64 + k *Kernel // mu protects below. mu sync.Mutex `state:"nosave"` @@ -100,7 +98,7 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ file: file, fileVFS2: fileVFS2, @@ -111,26 +109,30 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { } func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { - f.init() // Initialize table. + ctx := context.Background() + f.initNoLeakCheck() // Initialize table. + f.used = 0 for fd, d := range m { - f.setAll(fd, d.file, d.fileVFS2, d.flags) + if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil { + panic("VFS1 or VFS2 files set") + } // Note that we do _not_ need to acquire an extra table reference here. The // table reference will already be accounted for in the file, so we drop the // reference taken by set above. switch { case d.file != nil: - d.file.DecRef() + d.file.DecRef(ctx) case d.fileVFS2 != nil: - d.fileVFS2.DecRef() + d.fileVFS2.DecRef(ctx) } } } // drop drops the table reference. -func (f *FDTable) drop(file *fs.File) { +func (f *FDTable) drop(ctx context.Context, file *fs.File) { // Release locks. - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF}) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF}) // Send inotify events. d := file.Dirent @@ -146,61 +148,51 @@ func (f *FDTable) drop(file *fs.File) { d.InotifyEvent(ev, 0) // Drop the table reference. - file.DecRef() + file.DecRef(ctx) } // dropVFS2 drops the table reference. -func (f *FDTable) dropVFS2(file *vfs.FileDescription) { - // TODO(gvisor.dev/issue/1480): Release locks. +func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) { + // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the + // entire file. + err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET) + if err != nil && err != syserror.ENOLCK { + panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) + } // Generate inotify events. 
ev := uint32(linux.IN_CLOSE_NOWRITE) if file.IsWritable() { ev = linux.IN_CLOSE_WRITE } - file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent) - // Drop the table reference. - file.DecRef() -} - -// ID returns a unique identifier for this FDTable. -func (f *FDTable) ID() uint64 { - return f.uid + // Drop the table's reference. + file.DecRef(ctx) } // NewFDTable allocates a new FDTable that may be used by tasks in k. func (k *Kernel) NewFDTable() *FDTable { - f := &FDTable{ - k: k, - uid: atomic.AddUint64(&k.fdMapUids, 1), - } + f := &FDTable{k: k} f.init() return f } -// destroy removes all of the file descriptors from the map. -func (f *FDTable) destroy() { - f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { - return true +// DecRef implements RefCounter.DecRef. +// +// If f reaches zero references, all of its file descriptors are removed. +func (f *FDTable) DecRef(ctx context.Context) { + f.FDTableRefs.DecRef(func() { + f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { + return true + }) }) } -// DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FDTable) DecRef() { - f.DecRefWithDestructor(f.destroy) -} - -// Size returns the number of file descriptor slots currently allocated. -func (f *FDTable) Size() int { - size := atomic.LoadInt32(&f.used) - return int(size) -} - // forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. -func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { +func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { // retries tracks the number of failed TryIncRef attempts for the same FD. retries := 0 fd := int32(0) @@ -219,7 +211,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. } fn(fd, file, nil, flags) - file.DecRef() + file.DecRef(ctx) case fileVFS2 != nil: if !fileVFS2.TryIncRef() { retries++ @@ -229,7 +221,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. } fn(fd, nil, fileVFS2, flags) - fileVFS2.DecRef() + fileVFS2.DecRef(ctx) } retries = 0 fd++ @@ -239,7 +231,8 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes // String is a stringer for FDTable. 
func (f *FDTable) String() string { var buf strings.Builder - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + ctx := context.Background() + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { switch { case file != nil: n, _ := file.Dirent.FullName(nil /* root */) @@ -247,7 +240,11 @@ func (f *FDTable) String() string { case fileVFS2 != nil: vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() - name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + vd := fileVFS2.VirtualDentry() + if vd.Dentry() == nil { + panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, fileVFS2.Impl(), fileVFS2)) + } + name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) if err != nil { fmt.Fprintf(&buf, "<err: %v>\n", err) return @@ -282,7 +279,6 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags } f.mu.Lock() - defer f.mu.Unlock() // From f.next to find available fd. if fd < f.next { @@ -292,15 +288,25 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags // Install all entries. for i := fd; i < end && len(fds) < len(files); i++ { if d, _, _ := f.get(i); d == nil { - f.set(i, files[len(fds)], flags) // Set the descriptor. - fds = append(fds, i) // Record the file descriptor. + // Set the descriptor. + f.set(ctx, i, files[len(fds)], flags) + fds = append(fds, i) // Record the file descriptor. } } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { - f.set(i, nil, FDFlags{}) // Zap entry. + f.set(ctx, i, nil, FDFlags{}) + } + f.mu.Unlock() + + // Drop the reference taken by the call to f.set() that + // originally installed the file. Don't call f.drop() + // (generating inotify events, etc.) since the file should + // appear to have never been inserted into f. + for _, file := range files[:len(fds)] { + file.DecRef(ctx) } return nil, syscall.EMFILE } @@ -310,6 +316,7 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags f.next = fds[len(fds)-1] + 1 } + f.mu.Unlock() return fds, nil } @@ -337,7 +344,6 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes } f.mu.Lock() - defer f.mu.Unlock() // From f.next to find available fd. if fd < f.next { @@ -347,15 +353,25 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes // Install all entries. for i := fd; i < end && len(fds) < len(files); i++ { if d, _, _ := f.getVFS2(i); d == nil { - f.setVFS2(i, files[len(fds)], flags) // Set the descriptor. - fds = append(fds, i) // Record the file descriptor. + // Set the descriptor. + f.setVFS2(ctx, i, files[len(fds)], flags) + fds = append(fds, i) // Record the file descriptor. } } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { - f.setVFS2(i, nil, FDFlags{}) // Zap entry. + f.setVFS2(ctx, i, nil, FDFlags{}) + } + f.mu.Unlock() + + // Drop the reference taken by the call to f.setVFS2() that + // originally installed the file. Don't call f.dropVFS2() + // (generating inotify events, etc.) since the file should + // appear to have never been inserted into f. 
+ for _, file := range files[:len(fds)] { + file.DecRef(ctx) } return nil, syscall.EMFILE } @@ -365,6 +381,7 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes f.next = fds[len(fds)-1] + 1 } + f.mu.Unlock() return fds, nil } @@ -400,7 +417,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc } for fd < end { if d, _, _ := f.getVFS2(fd); d == nil { - f.setVFS2(fd, file, flags) + f.setVFS2(ctx, fd, file, flags) if fd == f.next { // Update next search start position. f.next = fd + 1 @@ -416,40 +433,55 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc // reference for that FD, the ref count for that existing reference is // decremented. func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { - return f.newFDAt(ctx, fd, file, nil, flags) + df, _, err := f.newFDAt(ctx, fd, file, nil, flags) + if err != nil { + return err + } + if df != nil { + f.drop(ctx, df) + } + return nil } // NewFDAtVFS2 sets the file reference for the given FD. If there is an active // reference for that FD, the ref count for that existing reference is // decremented. func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error { - return f.newFDAt(ctx, fd, nil, file, flags) + _, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags) + if err != nil { + return err + } + if dfVFS2 != nil { + f.dropVFS2(ctx, dfVFS2) + } + return nil } -func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error { +func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) { if fd < 0 { // Don't accept negative FDs. - return syscall.EBADF + return nil, nil, syscall.EBADF } // Check the limit for the provided file. if limitSet := limits.FromContext(ctx); limitSet != nil { if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { - return syscall.EMFILE + return nil, nil, syscall.EMFILE } } // Install the entry. f.mu.Lock() defer f.mu.Unlock() - f.setAll(fd, file, fileVFS2, flags) - return nil + + df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags) + return df, dfVFS2, nil } // SetFlags sets the flags for the given file descriptor. // // True is returned iff flags were changed. -func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { +func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error { if fd < 0 { // Don't accept negative FDs. return syscall.EBADF @@ -465,7 +497,30 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { } // Update the flags. - f.set(fd, file, flags) + f.set(ctx, fd, file, flags) + return nil +} + +// SetFlagsVFS2 sets the flags for the given file descriptor. +// +// True is returned iff flags were changed. +func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.getVFS2(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.setVFS2(ctx, fd, file, flags) return nil } @@ -523,50 +578,23 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { // // Precondition: The caller must be running on the task goroutine, or Task.mu // must be locked. 
-func (f *FDTable) GetFDs() []int32 { +func (f *FDTable) GetFDs(ctx context.Context) []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) - f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) return fds } -// GetRefs returns a stable slice of references to all files and bumps the -// reference count on each. The caller must use DecRef on each reference when -// they're done using the slice. -func (f *FDTable) GetRefs() []*fs.File { - files := make([]*fs.File, 0, f.Size()) - f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - file.IncRef() // Acquire a reference for caller. - files = append(files, file) - }) - return files -} - -// GetRefsVFS2 returns a stable slice of references to all files and bumps the -// reference count on each. The caller must use DecRef on each reference when -// they're done using the slice. -func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { - files := make([]*vfs.FileDescription, 0, f.Size()) - f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { - file.IncRef() // Acquire a reference for caller. - files = append(files, file) - }) - return files -} - // Fork returns an independent FDTable. -func (f *FDTable) Fork() *FDTable { +func (f *FDTable) Fork(ctx context.Context) *FDTable { clone := f.k.NewFDTable() - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. - switch { - case file != nil: - clone.set(fd, file, flags) - case fileVFS2 != nil: - clone.setVFS2(fd, fileVFS2, flags) + if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil { + panic("VFS1 or VFS2 files set") } }) return clone @@ -575,13 +603,12 @@ func (f *FDTable) Fork() *FDTable { // Remove removes an FD from and returns a non-file iff successful. // // N.B. Callers are required to use DecRef when they are done. -func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { +func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) { if fd < 0 { return nil, nil } f.mu.Lock() - defer f.mu.Unlock() // Update current available position. if fd < f.next { @@ -597,24 +624,51 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { case orig2 != nil: orig2.IncRef() } + if orig != nil || orig2 != nil { - f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry. } + f.mu.Unlock() + + if orig != nil { + f.drop(ctx, orig) + } + if orig2 != nil { + f.dropVFS2(ctx, orig2) + } + return orig, orig2 } // RemoveIf removes all FDs where cond is true. -func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { - f.mu.Lock() - defer f.mu.Unlock() +func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { + // TODO(gvisor.dev/issue/1624): Remove fs.File slice. 
+ var files []*fs.File + var filesVFS2 []*vfs.FileDescription - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.mu.Lock() + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { if cond(file, fileVFS2, flags) { - f.set(fd, nil, FDFlags{}) // Clear from table. + df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table. + if df != nil { + files = append(files, df) + } + if dfVFS2 != nil { + filesVFS2 = append(filesVFS2, dfVFS2) + } // Update current available position. if fd < f.next { f.next = fd } } }) + f.mu.Unlock() + + for _, file := range files { + f.drop(ctx, file) + } + + for _, file := range filesVFS2 { + f.dropVFS2(ctx, file) + } } diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 29f95a2c4..bf5460083 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -72,7 +72,7 @@ func TestFDTableMany(t *testing.T) { } i := int32(2) - fdTable.Remove(i) + fdTable.Remove(ctx, i) if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i { t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err) } @@ -93,7 +93,7 @@ func TestFDTableOverLimit(t *testing.T) { t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err) } else { for _, fd := range fds { - fdTable.Remove(fd) + fdTable.Remove(ctx, fd) } } @@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) { t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) } - ref, _ := fdTable.Remove(1) + ref, _ := fdTable.Remove(ctx, 1) if ref == nil { t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") } - ref.DecRef() + ref.DecRef(ctx) - if ref, _ := fdTable.Remove(1); ref != nil { + if ref, _ := fdTable.Remove(ctx, 1); ref != nil { t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") } }) @@ -191,7 +191,7 @@ func BenchmarkFDLookupAndDecRef(b *testing.B) { b.StartTimer() // Benchmark. for i := 0; i < b.N; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }) } @@ -219,7 +219,7 @@ func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { defer wg.Done() for i := 0; i < each; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }() } diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index 7fd97dc53..3476551f3 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "unsafe" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -30,12 +31,21 @@ type descriptorTable struct { slice unsafe.Pointer `state:".(map[int32]*descriptor)"` } -// init initializes the table. -func (f *FDTable) init() { +// initNoLeakCheck initializes the table without enabling leak checking. +// +// This is used when loading an FDTable after S/R, during which the ref count +// object itself will enable leak checking if necessary. +func (f *FDTable) initNoLeakCheck() { var slice []unsafe.Pointer // Empty slice. atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) } +// init initializes the table with leak checking. +func (f *FDTable) init() { + f.initNoLeakCheck() + f.EnableLeakCheck() +} + // get gets a file entry. // // The boolean indicates whether this was in range. 
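The FDTable changes above repeatedly apply one pattern: while f.mu is held, setAll only hands a displaced file back to the caller (the table's reference is transferred, not dropped), and drop()/dropVFS2() runs after the mutex is released, since dropping generates inotify events and releases POSIX locks that must not happen under f.mu. A minimal sketch of that shape, with a hypothetical release() standing in for drop()/dropVFS2():

package fdpattern

import "sync"

type resource struct{ name string }

type table struct {
	mu      sync.Mutex
	entries map[int32]*resource
}

// removeIf mirrors FDTable.RemoveIf: collect displaced entries under mu,
// then release them only after mu is dropped, because release may take
// other locks or generate events that must not happen under mu.
func (t *table) removeIf(cond func(*resource) bool) {
	var dropped []*resource
	t.mu.Lock()
	for fd, r := range t.entries {
		if cond(r) {
			dropped = append(dropped, r)
			delete(t.entries, fd)
		}
	}
	t.mu.Unlock()

	for _, r := range dropped {
		release(r) // hypothetical stand-in for f.drop()/f.dropVFS2()
	}
}

// release is a hypothetical destructor (DecRef, inotify events, lock cleanup).
func release(r *resource) { /* ... */ }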
@@ -76,33 +86,37 @@ func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, boo return d.file, d.fileVFS2, d.flags, true } -// set sets an entry. -// -// This handles accounting changes, as well as acquiring and releasing the -// reference needed by the table iff the file is different. +// CurrentMaxFDs returns the number of file descriptors that may be stored in f +// without reallocation. +func (f *FDTable) CurrentMaxFDs() int { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + return len(slice) +} + +// set sets an entry for VFS1, refer to setAll(). // // Precondition: mu must be held. -func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { - f.setAll(fd, file, nil, flags) +func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) *fs.File { + dropFile, _ := f.setAll(ctx, fd, file, nil, flags) + return dropFile } -// setVFS2 sets an entry. -// -// This handles accounting changes, as well as acquiring and releasing the -// reference needed by the table iff the file is different. +// setVFS2 sets an entry for VFS2, refer to setAll(). // // Precondition: mu must be held. -func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) { - f.setAll(fd, nil, file, flags) +func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription { + _, dropFile := f.setAll(ctx, fd, nil, file, flags) + return dropFile } -// setAll sets an entry. -// -// This handles accounting changes, as well as acquiring and releasing the -// reference needed by the table iff the file is different. +// setAll sets the file description referred to by fd to file/fileVFS2. If +// file/fileVFS2 are non-nil, it takes a reference on them. If setAll replaces +// an existing file description, it returns it with the FDTable's reference +// transferred to the caller, which must call f.drop/dropVFS2() on the returned +// file after unlocking f.mu. // // Precondition: mu must be held. -func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { +func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription) { if file != nil && fileVFS2 != nil { panic("VFS1 and VFS2 files set") } @@ -145,25 +159,25 @@ func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, } } - // Drop the table reference. + // Adjust used. + switch { + case orig == nil && desc != nil: + atomic.AddInt32(&f.used, 1) + case orig != nil && desc == nil: + atomic.AddInt32(&f.used, -1) + } + if orig != nil { switch { case orig.file != nil: if desc == nil || desc.file != orig.file { - f.drop(orig.file) + return orig.file, nil } case orig.fileVFS2 != nil: if desc == nil || desc.fileVFS2 != orig.fileVFS2 { - f.dropVFS2(orig.fileVFS2) + return nil, orig.fileVFS2 } } } - - // Adjust used. 
- switch { - case orig == nil && desc != nil: - atomic.AddInt32(&f.used, 1) - case orig != nil && desc == nil: - atomic.AddInt32(&f.used, -1) - } + return nil, nil } diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 47f78df9a..41fb2a784 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -17,7 +17,7 @@ package kernel import ( "fmt" - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -29,7 +29,7 @@ import ( // // +stateify savable type FSContext struct { - refs.AtomicRefCount + FSContextRefs // mu protects below. mu sync.Mutex `state:"nosave"` @@ -63,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { cwd: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } @@ -76,96 +76,104 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { cwdVFS2: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } -// destroy is the destructor for an FSContext. +// DecRef implements RefCounter.DecRef. // -// This will call DecRef on both root and cwd Dirents. If either call to -// DecRef returns an error, then it will be propagated. If both calls to -// DecRef return an error, then the one from root.DecRef will be propagated. +// When f reaches zero references, DecRef will be called on both root and cwd +// Dirents. // // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. -func (f *FSContext) destroy() { - // Hold f.mu so that we don't race with RootDirectory() and - // WorkingDirectory(). - f.mu.Lock() - defer f.mu.Unlock() - - if VFS2Enabled { - f.rootVFS2.DecRef() - f.rootVFS2 = vfs.VirtualDentry{} - f.cwdVFS2.DecRef() - f.cwdVFS2 = vfs.VirtualDentry{} - } else { - f.root.DecRef() - f.root = nil - f.cwd.DecRef() - f.cwd = nil - } -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FSContext) DecRef() { - f.DecRefWithDestructor(f.destroy) +func (f *FSContext) DecRef(ctx context.Context) { + f.FSContextRefs.DecRef(func() { + // Hold f.mu so that we don't race with RootDirectory() and + // WorkingDirectory(). + f.mu.Lock() + defer f.mu.Unlock() + + if VFS2Enabled { + f.rootVFS2.DecRef(ctx) + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef(ctx) + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef(ctx) + f.root = nil + f.cwd.DecRef(ctx) + f.cwd = nil + } + }) } // Fork forks this FSContext. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { + if !f.cwdVFS2.Ok() { + panic("FSContext.Fork() called after destroy") + } f.cwdVFS2.IncRef() f.rootVFS2.IncRef() } else { + if f.cwd == nil { + panic("FSContext.Fork() called after destroy") + } f.cwd.IncRef() f.root.IncRef() } - return &FSContext{ + ctx := &FSContext{ cwd: f.cwd, root: f.root, cwdVFS2: f.cwdVFS2, rootVFS2: f.rootVFS2, umask: f.umask, } + ctx.EnableLeakCheck() + return ctx } // WorkingDirectory returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. 
+// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() - f.cwd.IncRef() + if f.cwd != nil { + f.cwd.IncRef() + } return f.cwd } // WorkingDirectoryVFS2 returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return an empty vfs.VirtualDentry if called after f is +// destroyed, otherwise it will return a Dirent with a reference taken. func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() - f.cwdVFS2.IncRef() + if f.cwdVFS2.Ok() { + f.cwdVFS2.IncRef() + } return f.cwdVFS2 } // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { +// This is not a valid call after f is destroyed. +func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") } @@ -180,27 +188,31 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { old := f.cwd f.cwd = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetWorkingDirectoryVFS2 sets the current working directory. // This will take an extra reference on the VirtualDentry. // -// This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { +// This is not a valid call after f is destroyed. +func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() + if !f.cwdVFS2.Ok() { + panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v)) called after destroy", d)) + } + old := f.cwdVFS2 f.cwdVFS2 = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // RootDirectory returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -212,21 +224,23 @@ func (f *FSContext) RootDirectory() *fs.Dirent { // RootDirectoryVFS2 returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return an empty vfs.VirtualDentry if called after f is +// destroyed, otherwise it will return a Dirent with a reference taken. func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() - f.rootVFS2.IncRef() + if f.rootVFS2.Ok() { + f.rootVFS2.IncRef() + } return f.rootVFS2 } // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after free. -func (f *FSContext) SetRootDirectory(d *fs.Dirent) { +// This is not a valid call after f is destroyed. +func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") } @@ -241,13 +255,13 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) { old := f.root f.root = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. 
// -// This is not a valid call after free. -func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { +// This is not a valid call after f is destroyed. +func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") } @@ -263,7 +277,7 @@ func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { vd.IncRef() f.rootVFS2 = vd f.mu.Unlock() - old.DecRef() + old.DecRef(ctx) } // Umask returns the current umask. diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index c5021f2db..daa2dae76 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -51,6 +51,7 @@ go_test( srcs = ["futex_test.go"], library = ":futex", deps = [ + "//pkg/context", "//pkg/sync", "//pkg/usermem", ], diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 732e66da4..e4dcc4d40 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -19,6 +19,7 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -66,9 +67,9 @@ type Key struct { Offset uint64 } -func (k *Key) release() { +func (k *Key) release(t Target) { if k.MappingIdentity != nil { - k.MappingIdentity.DecRef() + k.MappingIdentity.DecRef(t) } k.Mappable = nil k.MappingIdentity = nil @@ -94,6 +95,8 @@ func (k *Key) matches(k2 *Key) bool { // Target abstracts memory accesses and keys. type Target interface { + context.Context + // SwapUint32 gives access to usermem.IO.SwapUint32. SwapUint32(addr usermem.Addr, new uint32) (uint32, error) @@ -296,7 +299,7 @@ func (b *bucket) wakeWaiterLocked(w *Waiter) { // bucket "to". // // Preconditions: b and to must be locked. -func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { +func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { if !w.key.matches(key) { @@ -308,7 +311,7 @@ func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { requeued := w w = w.Next() // Next iteration. b.waiters.Remove(requeued) - requeued.key.release() + requeued.key.release(t) requeued.key = nkey.clone() to.waiters.PushBack(requeued) requeued.bucket.Store(to) @@ -456,7 +459,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32 r := b.wakeLocked(&k, bitmask, n) b.mu.Unlock() - k.release() + k.release(t) return r, nil } @@ -465,12 +468,12 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, naddr, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -488,7 +491,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch done := b1.wakeLocked(&k1, ^uint32(0), nwake) // Requeue the number required. 
- b1.requeueLocked(b2, &k1, &k2, nreq) + b1.requeueLocked(t, b2, &k1, &k2, nreq) return done, nil } @@ -515,12 +518,12 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, addr2, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -571,7 +574,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // Perform our atomic check. if err := check(t, addr, val); err != nil { b.mu.Unlock() - w.key.release() + w.key.release(t) return err } @@ -585,7 +588,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // WaitComplete must be called when a Waiter previously added by WaitPrepare is // no longer eligible to be woken. -func (m *Manager) WaitComplete(w *Waiter) { +func (m *Manager) WaitComplete(w *Waiter, t Target) { // Remove w from the bucket it's in. for { b := w.bucket.Load() @@ -617,7 +620,7 @@ func (m *Manager) WaitComplete(w *Waiter) { } // Release references held by the waiter. - w.key.release() + w.key.release(t) } // LockPI attempts to lock the futex following the Priority-inheritance futex @@ -648,13 +651,13 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri success, err := m.lockPILocked(w, t, addr, tid, b, try) if err != nil { - w.key.release() + w.key.release(t) b.mu.Unlock() return false, err } if success || try { // Release waiter if it's not going to be a wait. - w.key.release() + w.key.release(t) } b.mu.Unlock() return success, nil @@ -717,10 +720,10 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3 } } -// UnlockPI unlock the futex following the Priority-inheritance futex -// rules. The address provided must contain the caller's TID. If there are -// waiters, TID of the next waiter (FIFO) is set to the given address, and the -// waiter woken up. If there are no waiters, 0 is set to the address. +// UnlockPI unlocks the futex following the Priority-inheritance futex rules. +// The address provided must contain the caller's TID. If there are waiters, +// TID of the next waiter (FIFO) is set to the given address, and the waiter +// woken up. If there are no waiters, 0 is set to the address. func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { k, err := getKey(t, addr, private) if err != nil { @@ -730,7 +733,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool err = m.unlockPILocked(t, addr, tid, b, &k) - k.release() + k.release(t) b.mu.Unlock() return err } diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 7c5c7665b..d0128c548 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -22,6 +22,7 @@ import ( "testing" "unsafe" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -29,28 +30,33 @@ import ( // testData implements the Target interface, and allows us to // treat the address passed for futex operations as an index in // a byte slice for testing simplicity. 
-type testData []byte +type testData struct { + context.Context + data []byte +} const sizeofInt32 = 4 func newTestData(size uint) testData { - return make([]byte, size) + return testData{ + data: make([]byte, size), + } } func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { - val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new) + val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new) return val, nil } func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { - if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) { + if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) { return old, nil } - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { @@ -83,7 +89,7 @@ func TestFutexWake(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup. if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 { @@ -106,7 +112,7 @@ func TestFutexWakeBitmask(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup using the wrong bitmask. if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 { @@ -141,7 +147,7 @@ func TestFutexWakeTwo(t *testing.T) { var ws [3]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform two wakeups. @@ -174,9 +180,9 @@ func TestFutexWakeUnrelated(t *testing.T) { // Start two waiters waiting for wakeup on different addresses. w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform two wakeups on the second address. if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 { @@ -216,9 +222,9 @@ func TestWakeOpFirstNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address 0. if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 { @@ -244,9 +250,9 @@ func TestWakeOpSecondNonEmpty(t *testing.T) { // Add two waiters on address sizeofInt32. w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(0), which should succeed). 
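For context on the API change running through these futex hunks: futex.Target now embeds context.Context, so the same object that performs the atomic memory accesses also supplies the context that key release (and hence WaitComplete) needs in order to DecRef MappingIdentity references. Below is a minimal sketch of a caller of the updated API, modeled on the testMutex further down in this test diff; the lock-word protocol and names are illustrative, not part of the patch.

package example

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/usermem"
)

// lock acquires a 0/1 futex word at addr, sleeping via the futex Manager when
// contended. d is both the memory Target and, post-patch, the context passed
// to WaitComplete for key release.
func lock(m *futex.Manager, d futex.Target, addr usermem.Addr) {
	for {
		// Fast path: 0 -> 1 acquires the lock.
		if old, _ := d.CompareAndSwapUint32(addr, 0, 1); old == 0 {
			return
		}
		// Slow path: block until woken, provided the word still reads 1.
		w := futex.NewWaiter()
		if err := m.WaitPrepare(w, d, addr, true /* private */, 1, ^uint32(0)); err != nil {
			continue // The word changed under us; retry the fast path.
		}
		<-w.C
		m.WaitComplete(w, d) // d now supplies the context for key release.
	}
}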
@@ -273,9 +279,9 @@ func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { // Add two waiters on address sizeofInt32. w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(1), which should fail). @@ -302,15 +308,15 @@ func TestWakeOpAllNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(0), which @@ -344,15 +350,15 @@ func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(1), which @@ -388,7 +394,7 @@ func TestWakeOpSameAddress(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -422,7 +428,7 @@ func TestWakeOpSameAddressFailingOp(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -472,7 +478,7 @@ func (t *testMutex) Lock() { for { // Attempt to grab the lock. if atomic.CompareAndSwapUint32( - (*uint32)(unsafe.Pointer(&t.d[t.a])), + (*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked, testMutexLocked) { // Lock held. @@ -490,7 +496,7 @@ func (t *testMutex) Lock() { panic("WaitPrepare returned unexpected error: " + err.Error()) } <-w.C - t.m.WaitComplete(w) + t.m.WaitComplete(w, t.d) } } @@ -498,7 +504,7 @@ func (t *testMutex) Lock() { // This will notify any waiters via the futex manager. func (t *testMutex) Unlock() { // Unlock. - atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked) + atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked) // Notify all waiters. 
t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32) diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 80a070d7e..b87e40dd1 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" @@ -24,6 +25,8 @@ import ( // // +stateify savable type IPCNamespace struct { + IPCNamespaceRefs + // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace @@ -33,11 +36,13 @@ type IPCNamespace struct { // NewIPCNamespace creates a new IPC namespace. func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { - return &IPCNamespace{ + ns := &IPCNamespace{ userNS: userNS, semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } + ns.EnableLeakCheck() + return ns } // SemaphoreRegistry returns the semaphore set registry for this namespace. @@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } +// DecRef implements refsvfs2.RefCounter.DecRef. +func (i *IPCNamespace) DecRef(ctx context.Context) { + i.IPCNamespaceRefs.DecRef(func() { + i.shms.Release(ctx) + }) +} + // IPCNamespace returns the task's IPC namespace. func (t *Task) IPCNamespace() *IPCNamespace { t.mu.Lock() diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go new file mode 100644 index 000000000..4fcdfc541 --- /dev/null +++ b/pkg/sentry/kernel/kcov.go @@ -0,0 +1,338 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "io" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov +// area. On Linux, the maximum is INT_MAX / 8. +const kcovAreaSizeMax = 10 * 1024 * 1024 + +// Kcov provides kernel coverage data to userspace through a memory-mapped +// region, as kcov does in Linux. +// +// To give the illusion that the data is always up to date, we update the shared +// memory every time before we return to userspace. +type Kcov struct { + // mfp provides application memory. It is immutable after creation. + mfp pgalloc.MemoryFileProvider + + // mu protects all of the fields below. + mu sync.RWMutex + + // mode is the current kcov mode. + mode uint8 + + // size is the size of the mapping through which the kernel conveys coverage + // information to userspace. + size uint64 + + // owningTask is the task that currently owns coverage data on the system. 
The + // interface for kcov essentially requires that coverage is only going to a + // single task. Note that kcov should only generate coverage data for the + // owning task, but we currently generate global coverage. + owningTask *Task + + // count is a locally cached version of the first uint64 in the kcov data, + // which is the number of subsequent entries representing PCs. + // + // It is used with kcovInode.countBlock(), to copy in/out the first element of + // the actual data in an efficient manner, avoid boilerplate, and prevent + // accidental garbage escapes by the temporary counts. + count uint64 + + mappable *mm.SpecialMappable +} + +// NewKcov creates and returns a Kcov instance. +func (k *Kernel) NewKcov() *Kcov { + return &Kcov{ + mfp: k, + } +} + +var coveragePool = sync.Pool{ + New: func() interface{} { + return make([]byte, 0) + }, +} + +// TaskWork implements TaskWorker.TaskWork. +func (kcov *Kcov) TaskWork(t *Task) { + kcov.mu.Lock() + defer kcov.mu.Unlock() + + if kcov.mode != linux.KCOV_MODE_TRACE_PC { + return + } + + rw := &kcovReadWriter{ + mf: kcov.mfp.MemoryFile(), + fr: kcov.mappable.FileRange(), + } + + // Read in the PC count. + if _, err := safemem.ReadFullToBlocks(rw, kcov.countBlock()); err != nil { + panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err)) + } + + rw.off = 8 * (1 + kcov.count) + n := coverage.ConsumeCoverageData(&kcovIOWriter{rw}) + + // Update the pc count, based on the number of entries written. Note that if + // we reached the end of the kcov area, we may not have written everything in + // output. + kcov.count += uint64(n / 8) + rw.off = 0 + if _, err := safemem.WriteFullFromBlocks(rw, kcov.countBlock()); err != nil { + panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err)) + } + + // Re-register for future work. + t.RegisterWork(kcov) +} + +// InitTrace performs the KCOV_INIT_TRACE ioctl. +func (kcov *Kcov) InitTrace(size uint64) error { + kcov.mu.Lock() + defer kcov.mu.Unlock() + + if kcov.mode != linux.KCOV_MODE_DISABLED { + return syserror.EBUSY + } + + // To simplify all the logic around mapping, we require that the length of the + // shared region is a multiple of the system page size. + if (8*size)&(usermem.PageSize-1) != 0 { + return syserror.EINVAL + } + + // We need space for at least two uint64s to hold current position and a + // single PC. + if size < 2 || size > kcovAreaSizeMax { + return syserror.EINVAL + } + + kcov.size = size + kcov.mode = linux.KCOV_MODE_INIT + return nil +} + +// EnableTrace performs the KCOV_ENABLE_TRACE ioctl. +func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error { + t := TaskFromContext(ctx) + if t == nil { + panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine") + } + + kcov.mu.Lock() + defer kcov.mu.Unlock() + + // KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call. + if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil { + return syserror.EINVAL + } + + switch traceKind { + case linux.KCOV_TRACE_PC: + kcov.mode = linux.KCOV_MODE_TRACE_PC + case linux.KCOV_TRACE_CMP: + // We do not support KCOV_MODE_TRACE_CMP. + return syserror.ENOTSUP + default: + return syserror.EINVAL + } + + if kcov.owningTask != nil && kcov.owningTask != t { + return syserror.EBUSY + } + + kcov.owningTask = t + t.SetKcov(kcov) + t.RegisterWork(kcov) + + // Clear existing coverage data; the task expects to read only coverage data + // from the time it is activated. 
+	coverage.ClearCoverageData()
+	return nil
+}
+
+// DisableTrace performs the KCOV_DISABLE_TRACE ioctl.
+func (kcov *Kcov) DisableTrace(ctx context.Context) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	t := TaskFromContext(ctx)
+	if t == nil {
+		panic("kcovInode.DisableTrace() cannot be used outside of a task goroutine")
+	}
+
+	if t != kcov.owningTask {
+		return syserror.EINVAL
+	}
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.owningTask = nil
+	if kcov.mappable != nil {
+		kcov.mappable.DecRef(ctx)
+		kcov.mappable = nil
+	}
+	return nil
+}
+
+// Clear resets the mode and clears the owning task and memory mapping for kcov.
+// It is called when the fd corresponding to kcov is closed. Note that the mode
+// needs to be set so that the next call to kcov.TaskWork() will exit early.
+func (kcov *Kcov) Clear(ctx context.Context) {
+	kcov.mu.Lock()
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.owningTask = nil
+	if kcov.mappable != nil {
+		kcov.mappable.DecRef(ctx)
+		kcov.mappable = nil
+	}
+	kcov.mu.Unlock()
+}
+
+// OnTaskExit is called when the owning task exits. It is similar to
+// kcov.Clear(), except the memory mapping is not cleared, so that the same
+// mapping can be used in the future if kcov is enabled again by another task.
+func (kcov *Kcov) OnTaskExit() {
+	kcov.mu.Lock()
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.owningTask = nil
+	kcov.mu.Unlock()
+}
+
+// ConfigureMMap is called by the vfs.FileDescription for this kcov instance to
+// implement vfs.FileDescription.ConfigureMMap.
+func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	if kcov.mode != linux.KCOV_MODE_INIT {
+		return syserror.EINVAL
+	}
+
+	if kcov.mappable == nil {
+		// Set up the kcov area.
+		fr, err := kcov.mfp.MemoryFile().Allocate(kcov.size*8, usage.Anonymous)
+		if err != nil {
+			return err
+		}
+
+		// Get the thread id for the mmap name.
+		t := TaskFromContext(ctx)
+		if t == nil {
+			panic("TaskFromContext returned nil")
+		}
+		// For convenience, a special mappable is used here. Note that these mappings
+		// will look different under /proc/[pid]/maps than they do on Linux.
+		kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mfp, fr)
+	}
+	kcov.mappable.IncRef()
+	opts.Mappable = kcov.mappable
+	opts.MappingIdentity = kcov.mappable
+	return nil
+}
+
+// kcovReadWriter implements safemem.Reader and safemem.Writer.
+type kcovReadWriter struct {
+	off uint64
+	mf  *pgalloc.MemoryFile
+	fr  memmap.FileRange
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+
+	// Limit the read to the kcov range and check for overflow.
+	if rw.fr.Length() <= rw.off {
+		return 0, io.EOF
+	}
+	start := rw.fr.Start + rw.off
+	end := rw.fr.Start + rw.fr.Length()
+	if rend := start + dsts.NumBytes(); rend < end {
+		end = rend
+	}
+
+	// Get internal mappings.
+	bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read)
+	if err != nil {
+		return 0, err
+	}
+
+	// Copy from internal mappings.
+	n, err := safemem.CopySeq(dsts, bs)
+	rw.off += n
+	return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+
+	// Limit the write to the kcov area and check for overflow.
+	if rw.fr.Length() <= rw.off {
+		return 0, io.EOF
+	}
+	start := rw.fr.Start + rw.off
+	end := rw.fr.Start + rw.fr.Length()
+	if wend := start + srcs.NumBytes(); wend < end {
+		end = wend
+	}
+
+	// Get internal mapping.
+	bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write)
+	if err != nil {
+		return 0, err
+	}
+
+	// Copy to internal mapping.
+	n, err := safemem.CopySeq(bs, srcs)
+	rw.off += n
+	return n, err
+}
+
+// kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter.
+type kcovIOWriter struct {
+	rw *kcovReadWriter
+}
+
+// Write implements io.Writer.Write.
+func (w *kcovIOWriter) Write(p []byte) (int, error) {
+	bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p))
+	n, err := safemem.WriteFullFromBlocks(w.rw, bs)
+	return int(n), err
+}
diff --git a/pkg/sentry/kernel/kcov_unsafe.go b/pkg/sentry/kernel/kcov_unsafe.go
new file mode 100644
index 000000000..6f8a0266b
--- /dev/null
+++ b/pkg/sentry/kernel/kcov_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// countBlock provides a safemem.BlockSeq for kcov.count.
+//
+// Like kcov.count, the block returned is protected by kcov.mu.
+func (kcov *Kcov) countBlock() safemem.BlockSeq {
+	return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&kcov.count), int(unsafe.Sizeof(kcov.count))))
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 5efeb3767..9b2be44d4 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -34,12 +34,12 @@ package kernel
 import (
 	"errors"
 	"fmt"
-	"io"
 	"path/filepath"
 	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/eventchannel"
@@ -73,6 +73,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/state"
+	"gvisor.dev/gvisor/pkg/state/wire"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
@@ -81,6 +82,10 @@ import (
 // easy access everywhere. To be removed once VFS2 becomes the default.
 var VFS2Enabled = false
 
+// FUSEEnabled is set to true when FUSE is enabled. Added as a global to allow
+// easy access everywhere. To be removed once FUSE is completed.
+var FUSEEnabled = false
+
 // Kernel represents an emulated Linux kernel. It must be initialized by calling
 // Init() or LoadFrom().
 //
@@ -194,11 +199,6 @@ type Kernel struct {
 	// cpuClockTickerSetting is protected by runningTasksMu.
 	cpuClockTickerSetting ktime.Setting
 
-	// fdMapUids is an ever-increasing counter for generating FDTable uids.
-	//
-	// fdMapUids is mutable, and is accessed using atomic memory operations.
-	fdMapUids uint64
-
 	// uniqueID is used to generate unique identifiers.
 	//
 	// uniqueID is mutable, and is accessed using atomic memory operations.
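The offset arithmetic in Kcov.TaskWork above implies a simple layout for the shared area: the first uint64 is the live PC count, and PC i occupies the eight bytes starting at offset 8*(1+i), which is why the writer is seeded with rw.off = 8 * (1 + kcov.count). A sketch of how a consumer of the mmapped buffer would decode it follows; plain Go over a byte slice, with the mmap plumbing and little-endian byte order as assumptions rather than part of the patch.

package example

import "encoding/binary"

// decodeKcovArea interprets the buffer maintained by Kcov.TaskWork:
// area[0:8] holds the PC count n, and PCs occupy area[8 : 8*(1+n)].
func decodeKcovArea(area []byte) []uint64 {
	if len(area) < 8 {
		return nil
	}
	n := binary.LittleEndian.Uint64(area[:8])
	// Clamp to what the buffer can hold, mirroring kcovReadWriter's
	// bounds checks.
	if max := uint64(len(area)/8 - 1); n > max {
		n = max
	}
	pcs := make([]uint64, n)
	for i := range pcs {
		off := 8 * (1 + uint64(i))
		pcs[i] = binary.LittleEndian.Uint64(area[off : off+8])
	}
	return pcs
}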
@@ -221,13 +221,18 @@ type Kernel struct { // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` - // sockets is the list of all network sockets the system. Protected by - // extMu. + // sockets is the list of all network sockets in the system. + // Protected by extMu. + // TODO(gvisor.dev/issue/1624): Only used by VFS1. sockets socketList - // nextSocketEntry is the next entry number to use in sockets. Protected + // socketsVFS2 records all network sockets in the system. Protected by + // extMu. + socketsVFS2 map[*vfs.FileDescription]*SocketRecord + + // nextSocketRecord is the next entry number to use in sockets. Protected // by extMu. - nextSocketEntry uint64 + nextSocketRecord uint64 // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` @@ -249,7 +254,7 @@ type Kernel struct { // SpecialOpts contains special kernel options. SpecialOpts - // VFS keeps the filesystem state used across the kernel. + // vfs keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem // hostMount is the Mount used for file descriptors that were imported @@ -336,7 +341,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("Timekeeper is nil") } if args.Timekeeper.clocks == nil { - return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()") } if args.RootUserNamespace == nil { return fmt.Errorf("RootUserNamespace is nil") @@ -361,7 +366,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.useHostCores = true maxCPU, err := hostcpu.MaxPossibleCPU() if err != nil { - return fmt.Errorf("Failed to get maximum CPU number: %v", err) + return fmt.Errorf("failed to get maximum CPU number: %v", err) } minAppCores := uint(maxCPU) + 1 if k.applicationCores < minAppCores { @@ -377,7 +382,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.netlinkPorts = port.New() if VFS2Enabled { - if err := k.vfs.Init(); err != nil { + ctx := k.SupervisorContext() + if err := k.vfs.Init(ctx); err != nil { return fmt.Errorf("failed to initialize VFS: %v", err) } @@ -385,19 +391,19 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create pipefs filesystem: %v", err) } - defer pipeFilesystem.DecRef() + defer pipeFilesystem.DecRef(ctx) pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create pipefs mount: %v", err) } k.pipeMount = pipeMount - tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) if err != nil { return fmt.Errorf("failed to create tmpfs filesystem: %v", err) } - defer tmpfsFilesystem.DecRef() - defer tmpfsRoot.DecRef() + defer tmpfsFilesystem.DecRef(ctx) + defer tmpfsRoot.DecRef(ctx) shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create tmpfs mount: %v", err) @@ -408,12 +414,14 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) } - defer socketFilesystem.DecRef() + defer socketFilesystem.DecRef(ctx) socketMount, err := 
k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create sockfs mount: %v", err) } k.socketMount = socketMount + + k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord) } return nil @@ -422,56 +430,70 @@ func (k *Kernel) Init(args InitKernelArgs) error { // SaveTo saves the state of k to w. // // Preconditions: The kernel must be paused throughout the call to SaveTo. -func (k *Kernel) SaveTo(w io.Writer) error { +func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error { saveStart := time.Now() - ctx := k.SupervisorContext() // Do not allow other Kernel methods to affect it while it's being saved. k.extMu.Lock() defer k.extMu.Unlock() // Stop time. - k.pauseTimeLocked() - defer k.resumeTimeLocked() + k.pauseTimeLocked(ctx) + defer k.resumeTimeLocked(ctx) // Evict all evictable MemoryFile allocations. k.mf.StartEvictions() k.mf.WaitForEvictions() - // Flush write operations on open files so data reaches backing storage. - // This must come after MemoryFile eviction since eviction may cause file - // writes. - if err := k.tasks.flushWritesToFiles(ctx); err != nil { - return err - } + if VFS2Enabled { + // Discard unsavable mappings, such as those for host file descriptors. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Prepare filesystems for saving. This must be done after + // invalidateUnsavableMappings(), since dropping memory mappings may + // affect filesystem state (e.g. page cache reference counts). + if err := k.vfs.PrepareSave(ctx); err != nil { + return err + } + } else { + // Flush cached file writes to backing storage. This must come after + // MemoryFile eviction since eviction may cause file writes. + if err := k.flushWritesToFiles(ctx); err != nil { + return err + } - // Remove all epoll waiter objects from underlying wait queues. - // NOTE: for programs to resume execution in future snapshot scenarios, - // we will need to re-establish these waiter objects after saving. - k.tasks.unregisterEpollWaiters() + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters(ctx) - // Clear the dirent cache before saving because Dirents must be Loaded in a - // particular order (parents before children), and Loading dirents from a cache - // breaks that order. - if err := k.flushMountSourceRefs(); err != nil { - return err - } + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + if err := k.flushMountSourceRefs(ctx); err != nil { + return err + } - // Ensure that all pending asynchronous work is complete: - // - inode and mount release - // - asynchronuous IO - fs.AsyncBarrier() - - // Once all fs work has completed (flushed references have all been released), - // reset mount mappings. This allows individual mounts to save how inodes map - // to filesystem resources. Without this, fs.Inodes cannot be restored. - fs.SaveInodeMappings() - - // Discard unsavable mappings, such as those for host file descriptors. - // This must be done after waiting for "asynchronous fs work", which - // includes async I/O that may touch application memory. 
- if err := k.invalidateUnsavableMappings(ctx); err != nil { - return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + // Ensure that all inode and mount release operations have completed. + fs.AsyncBarrier() + + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() + + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. + // + // TODO(gvisor.dev/issue/1624): This rationale is believed to be + // obsolete since AIO callbacks are now waited-for by Kernel.Pause(), + // but this order is conservatively retained for VFS1. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } } // Save the CPUID FeatureSet before the rest of the kernel so we can @@ -480,23 +502,23 @@ func (k *Kernel) SaveTo(w io.Writer) error { // // N.B. This will also be saved along with the full kernel save below. cpuidStart := time.Now() - if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil { + if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) // Save the kernel state. kernelStart := time.Now() - var stats state.Stats - if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil { + stats, err := state.Save(ctx, w, k) + if err != nil { return err } - log.Infof("Kernel save stats: %s", &stats) + log.Infof("Kernel save stats: %s", stats.String()) log.Infof("Kernel save took [%s].", time.Since(kernelStart)) // Save the memory file's state. memoryStart := time.Now() - if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { + if err := k.mf.SaveTo(ctx, w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) @@ -508,7 +530,9 @@ func (k *Kernel) SaveTo(w io.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. -func (k *Kernel) flushMountSourceRefs() error { +// +// Preconditions: !VFS2Enabled. +func (k *Kernel) flushMountSourceRefs(ctx context.Context) error { // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() @@ -524,7 +548,7 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) @@ -534,12 +558,7 @@ func (k *Kernel) flushMountSourceRefs() error { // each task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. 
- if VFS2Enabled { - return nil - } - +func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -547,7 +566,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) if t.fdTable == nil { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } @@ -556,9 +575,9 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) return err } -func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { +// Preconditions: !VFS2Enabled. +func (k *Kernel) flushWritesToFiles(ctx context.Context) error { + return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } @@ -580,6 +599,32 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { }) } +// Preconditions: !VFS2Enabled. +func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) { + ts.mu.RLock() + defer ts.mu.RUnlock() + + // Tasks that belong to the same process could potentially point to the + // same FDTable. So we retain a map of processed ones to avoid + // processing the same FDTable multiple times. + processed := make(map[*FDTable]struct{}) + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if t.fdTable == nil { + continue + } + if _, ok := processed[t.fdTable]; ok { + continue + } + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + }) + processed[t.fdTable] = struct{}{} + } +} + // Preconditions: The kernel must be paused. func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { invalidated := make(map[*mm.MemoryManager]struct{}) @@ -605,38 +650,8 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { return nil } -func (ts *TaskSet) unregisterEpollWaiters() { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - if VFS2Enabled { - return - } - - ts.mu.RLock() - defer ts.mu.RUnlock() - - // Tasks that belong to the same process could potentially point to the - // same FDTable. So we retain a map of processed ones to avoid - // processing the same FDTable multiple times. - processed := make(map[*FDTable]struct{}) - for t := range ts.Root.tids { - // We can skip locking Task.mu here since the kernel is paused. - if t.fdTable == nil { - continue - } - if _, ok := processed[t.fdTable]; ok { - continue - } - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if e, ok := file.FileOperations.(*epoll.EventPoll); ok { - e.UnregisterEpollWaiters() - } - }) - processed[t.fdTable] = struct{}{} - } -} - // LoadFrom returns a new Kernel loaded from args. 
-func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error { +func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { loadStart := time.Now() initAppCores := k.applicationCores @@ -647,7 +662,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() var features cpuid.FeatureSet - if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil { + if _, err := state.Load(ctx, r, &features); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) @@ -662,11 +677,11 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the kernel state. kernelStart := time.Now() - var stats state.Stats - if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil { + stats, err := state.Load(ctx, r, k) + if err != nil { return err } - log.Infof("Kernel load stats: %s", &stats) + log.Infof("Kernel load stats: %s", stats.String()) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) // rootNetworkNamespace should be populated after loading the state file. @@ -675,7 +690,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the memory file's state. memoryStart := time.Now() - if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { + if err := k.mf.LoadFrom(ctx, r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) @@ -687,11 +702,17 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) net.Resume() } - // Ensure that all pending asynchronous work is complete: - // - namedpipe opening - // - inode file opening - if err := fs.AsyncErrorBarrier(); err != nil { - return err + if VFS2Enabled { + if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil { + return err + } + } else { + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + if err := fs.AsyncErrorBarrier(); err != nil { + return err + } } tcpip.AsyncLoading.Wait() @@ -820,7 +841,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.args.UTSNamespace case CtxIPCNamespace: - return ctx.args.IPCNamespace + ipcns := ctx.args.IPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: @@ -833,14 +856,16 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { if ctx.args.MountNamespaceVFS2 == nil { return nil } - // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. - return ctx.args.MountNamespaceVFS2.Root() + root := ctx.args.MountNamespaceVFS2.Root() + root.IncRef() + return root case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } - // MountNamespaceVFS2 takes a reference for us. 
- return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns.IncRef() + return mntns case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case inet.CtxStack: @@ -890,20 +915,20 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, opener fsbridge.Lookup fsContext *FSContext mntns *fs.MountNamespace + mntnsVFS2 *vfs.MountNamespace ) if VFS2Enabled { - mntnsVFS2 := args.MountNamespaceVFS2 + mntnsVFS2 = args.MountNamespaceVFS2 if mntnsVFS2 == nil { - // MountNamespaceVFS2 adds a reference to the namespace, which is - // transferred to the new process. - mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2() + // Add a reference to the namespace, which is transferred to the new process. + mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2() + mntnsVFS2.IncRef() } // Get the root directory from the MountNamespace. - root := args.MountNamespaceVFS2.Root() - // The call to newFSContext below will take a reference on root, so we - // don't need to hold this one. - defer root.DecRef() + root := mntnsVFS2.Root() + root.IncRef() + defer root.DecRef(ctx) // Grab the working directory. wd := root // Default. @@ -921,7 +946,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) fsContext = NewFSContextVFS2(root, wd, args.Umask) @@ -936,7 +961,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, root := mntns.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. - defer root.DecRef() + defer root.DecRef(ctx) // Grab the working directory. remainingTraversals := args.MaxSymlinkTraversals @@ -947,13 +972,17 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewFSLookup(mntns, root, wd) fsContext = newFSContext(root, wd, args.Umask) } tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + cu := cleanup.Make(func() { + tg.Release(ctx) + }) + defer cu.Clean() // Check which file to start from. switch { @@ -1010,16 +1039,17 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, - MountNamespaceVFS2: args.MountNamespaceVFS2, + MountNamespaceVFS2: mntnsVFS2, ContainerID: args.ContainerID, } - t, err := k.tasks.NewTask(config) + t, err := k.tasks.NewTask(ctx, config) if err != nil { return nil, 0, err } t.traceExecEvent(tc) // Simulate exec for tracing. // Success. + cu.Release() tgid := k.tasks.Root.IDOfThreadGroup(tg) if k.globalInit == nil { k.globalInit = tg @@ -1057,7 +1087,7 @@ func (k *Kernel) Start() error { // If k was created by LoadKernelFrom, timers were stopped during // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, // this is a no-op. - k.resumeTimeLocked() + k.resumeTimeLocked(k.SupervisorContext()) // Start task goroutines. 
k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() @@ -1069,9 +1099,10 @@ func (k *Kernel) Start() error { // pauseTimeLocked pauses all Timers and Timekeeper updates. // -// Preconditions: Any task goroutines running in k must be stopped. k.extMu -// must be locked. -func (k *Kernel) pauseTimeLocked() { +// Preconditions: +// * Any task goroutines running in k must be stopped. +// * k.extMu must be locked. +func (k *Kernel) pauseTimeLocked(ctx context.Context) { // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before // Kernel.Start(). if k.cpuClockTicker != nil { @@ -1093,7 +1124,7 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() @@ -1113,9 +1144,10 @@ func (k *Kernel) pauseTimeLocked() { // pauseTimeLocked has not been previously called, resumeTimeLocked has no // effect. // -// Preconditions: Any task goroutines running in k must be stopped. k.extMu -// must be locked. -func (k *Kernel) resumeTimeLocked() { +// Preconditions: +// * Any task goroutines running in k must be stopped. +// * k.extMu must be locked. +func (k *Kernel) resumeTimeLocked(ctx context.Context) { if k.cpuClockTicker != nil { k.cpuClockTicker.Resume() } @@ -1129,7 +1161,7 @@ func (k *Kernel) resumeTimeLocked() { } } if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() @@ -1254,13 +1286,22 @@ func (k *Kernel) Kill(es ExitStatus) { } // Pause requests that all tasks in k temporarily stop executing, and blocks -// until all tasks in k have stopped. Multiple calls to Pause nest and require -// an equal number of calls to Unpause to resume execution. +// until all tasks and asynchronous I/O operations in k have stopped. Multiple +// calls to Pause nest and require an equal number of calls to Unpause to +// resume execution. func (k *Kernel) Pause() { k.extMu.Lock() k.tasks.BeginExternalStop() k.extMu.Unlock() k.tasks.runningGoroutines.Wait() + k.tasks.aioGoroutines.Wait() +} + +// ReceiveTaskStates receives full states for all tasks. +func (k *Kernel) ReceiveTaskStates() { + k.extMu.Lock() + k.tasks.PullFullState() + k.extMu.Unlock() } // Unpause ends the effect of a previous call to Pause. If Unpause is called @@ -1353,8 +1394,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace { return k.rootUTSNamespace } -// RootIPCNamespace returns the root IPCNamespace. +// RootIPCNamespace takes a reference and returns the root IPCNamespace. func (k *Kernel) RootIPCNamespace() *IPCNamespace { + k.rootIPCNamespace.IncRef() return k.rootIPCNamespace } @@ -1470,6 +1512,11 @@ func (k *Kernel) NowMonotonic() int64 { return now } +// AfterFunc implements tcpip.Clock.AfterFunc. +func (k *Kernel) AfterFunc(d time.Duration, f func()) tcpip.Timer { + return ktime.TcpipAfterFunc(k.realtimeClock, d, f) +} + // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or // LoadFrom. 
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { @@ -1494,20 +1541,27 @@ func (k *Kernel) SupervisorContext() context.Context { } } -// SocketEntry represents a socket recorded in Kernel.sockets. It implements +// SocketRecord represents a socket recorded in Kernel.socketsVFS2. +// +// +stateify savable +type SocketRecord struct { + k *Kernel + Sock *refs.WeakRef // TODO(gvisor.dev/issue/1624): Only used by VFS1. + SockVFS2 *vfs.FileDescription // Only used by VFS2. + ID uint64 // Socket table entry number. +} + +// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It implements // refs.WeakRefUser for sockets stored in the socket table. // // +stateify savable -type SocketEntry struct { +type SocketRecordVFS1 struct { socketEntry - k *Kernel - Sock *refs.WeakRef - SockVFS2 *vfs.FileDescription - ID uint64 // Socket table entry number. + SocketRecord } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (s *SocketEntry) WeakRefGone() { +func (s *SocketRecordVFS1) WeakRefGone(context.Context) { s.k.extMu.Lock() s.k.sockets.Remove(s) s.k.extMu.Unlock() @@ -1518,9 +1572,14 @@ func (s *SocketEntry) WeakRefGone() { // Precondition: Caller must hold a reference to sock. func (k *Kernel) RecordSocket(sock *fs.File) { k.extMu.Lock() - id := k.nextSocketEntry - k.nextSocketEntry++ - s := &SocketEntry{k: k, ID: id} + id := k.nextSocketRecord + k.nextSocketRecord++ + s := &SocketRecordVFS1{ + SocketRecord: SocketRecord{ + k: k, + ID: id, + }, + } s.Sock = refs.NewWeakRef(sock, s) k.sockets.PushBack(s) k.extMu.Unlock() @@ -1532,29 +1591,45 @@ func (k *Kernel) RecordSocket(sock *fs.File) { // Precondition: Caller must hold a reference to sock. // // Note that the socket table will not hold a reference on the -// vfs.FileDescription, because we do not support weak refs on VFS2 files. +// vfs.FileDescription. func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { k.extMu.Lock() - id := k.nextSocketEntry - k.nextSocketEntry++ - s := &SocketEntry{ + if _, ok := k.socketsVFS2[sock]; ok { + panic(fmt.Sprintf("Socket %p added twice", sock)) + } + id := k.nextSocketRecord + k.nextSocketRecord++ + s := &SocketRecord{ k: k, ID: id, SockVFS2: sock, } - k.sockets.PushBack(s) + k.socketsVFS2[sock] = s + k.extMu.Unlock() +} + +// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table. +func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) { + k.extMu.Lock() + delete(k.socketsVFS2, sock) k.extMu.Unlock() } // ListSockets returns a snapshot of all sockets. // -// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef() +// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef() // to get a reference on a socket in the table. 
-func (k *Kernel) ListSockets() []*SocketEntry { +func (k *Kernel) ListSockets() []*SocketRecord { k.extMu.Lock() - var socks []*SocketEntry - for s := k.sockets.Front(); s != nil; s = s.Next() { - socks = append(socks, s) + var socks []*SocketRecord + if VFS2Enabled { + for _, s := range k.socketsVFS2 { + socks = append(socks, s) + } + } else { + for s := k.sockets.Front(); s != nil; s = s.Next() { + socks = append(socks, &s.SocketRecord) + } } k.extMu.Unlock() return socks @@ -1582,7 +1657,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.k.rootUTSNamespace case CtxIPCNamespace: - return ctx.k.rootIPCNamespace + ipcns := ctx.k.rootIPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: // The supervisor context is global root. return auth.NewRootCredentials(ctx.k.rootUserNamespace) @@ -1595,16 +1672,16 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { if ctx.k.globalInit == nil { return vfs.VirtualDentry{} } - mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() - defer mntns.DecRef() - // Root() takes a reference on the root dirent for us. - return mntns.Root() + root := ctx.k.GlobalInit().Leader().MountNamespaceVFS2().Root() + root.IncRef() + return root case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } - // MountNamespaceVFS2() takes a reference for us. - return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns.IncRef() + return mntns case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case inet.CtxStack: @@ -1685,3 +1762,20 @@ func (k *Kernel) ShmMount() *vfs.Mount { func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount } + +// Release releases resources owned by k. +// +// Precondition: This should only be called after the kernel is fully +// initialized, e.g. after k.Start() has been called. +func (k *Kernel) Release() { + ctx := k.SupervisorContext() + if VFS2Enabled { + k.hostMount.DecRef(ctx) + k.pipeMount.DecRef(ctx) + k.shmMount.DecRef(ctx) + k.socketMount.DecRef(ctx) + k.vfs.Release(ctx) + } + k.timekeeper.Destroy() + k.vdso.Release(ctx) +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 7bfa9075a..99134e634 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -21,11 +21,13 @@ go_library( "//pkg/amutex", "//pkg/buffer", "//pkg/context", + "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 4b688c627..6497dc4ba 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -93,7 +93,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { if !waitFor(&i.mu, &i.wWakeup, ctx) { - r.DecRef() + r.DecRef(ctx) return nil, syserror.ErrInterrupted } } @@ -111,12 +111,12 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi // On a nonblocking, write-only open, the open fails with ENXIO if the // read side isn't open yet. 
if flags.NonBlocking { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ENXIO } if !waitFor(&i.mu, &i.rWakeup, ctx) { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ErrInterrupted } } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index ab75a87ff..ce0db5583 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -167,7 +167,7 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) { f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) - rFile.DecRef() + rFile.DecRef(ctx) wDone := make(chan struct{}) // This open for write should block because the reader is now gone. diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 79645d7d2..67beb0ad6 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -17,6 +17,7 @@ package pipe import ( "fmt" + "io" "sync/atomic" "syscall" @@ -152,7 +153,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) // The p.Open calls below will each take a reference on the Dirent. We // must drop the one we already have. - defer d.DecRef() + defer d.DecRef(ctx) return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true}) } @@ -200,22 +201,22 @@ type readOps struct { // // Precondition: this pipe must have readers. func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { - // Don't block for a zero-length read even if the pipe is empty. - if ops.left() == 0 { - return 0, nil - } - p.mu.Lock() defer p.mu.Unlock() return p.readLocked(ctx, ops) } func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { + // Don't block for a zero-length read even if the pipe is empty. + if ops.left() == 0 { + return 0, nil + } + // Is the pipe empty? if p.view.Size() == 0 { if !p.HasWriters() { // There are no writers, return EOF. 
- return 0, nil + return 0, io.EOF } return 0, syserror.ErrWouldBlock } @@ -388,6 +389,10 @@ func (p *Pipe) rwReadiness() waiter.EventMask { func (p *Pipe) queued() int64 { p.mu.Lock() defer p.mu.Unlock() + return p.queuedLocked() +} + +func (p *Pipe) queuedLocked() int64 { return p.view.Size() } diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index bda739dbe..fe97e9800 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -27,8 +27,8 @@ import ( func TestPipeRW(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := []byte("here's some bytes") wantN := int64(len(msg)) @@ -47,8 +47,8 @@ func TestPipeRW(t *testing.T) { func TestPipeReadBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) if n != 0 || err != syserror.ErrWouldBlock { @@ -62,8 +62,8 @@ func TestPipeWriteBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := make([]byte, capacity+1) n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) @@ -77,8 +77,8 @@ func TestPipeWriteUntilEnd(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := []byte("here's some bytes") diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index aacf28da2..f665920cb 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -33,7 +34,7 @@ import ( // the old fs architecture. // Release cleans up the pipe's state. -func (p *Pipe) Release() { +func (p *Pipe) Release(context.Context) { p.rClose() p.wClose() @@ -145,9 +146,14 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume v = math.MaxInt32 // Silently truncate. } // Copy result to userspace. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) + iocc := primitive.IOCopyContext{ + IO: io, + Ctx: ctx, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + } + _, err := primitive.CopyInt32Out(&iocc, args[2].Pointer(), int32(v)) return 0, err default: return 0, syscall.ENOTTY diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 7724b4452..ac18785c0 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Reader struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (r *Reader) Release() { +func (r *Reader) Release(context.Context) { r.Pipe.rClose() // Wake up writers. 
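The readLocked change above is a behavioral fix worth spelling out: an empty pipe now distinguishes "all writers closed" (io.EOF) from "empty but writers remain" (syserror.ErrWouldBlock), and the zero-length early return moved inside readLocked so callers that already hold pipe.mu, such as the splice paths later in this diff, see identical semantics. A sketch of how a read loop would branch on the result (illustrative only; the real callers live in the fs and vfs layers):

package example

import (
	"io"

	"gvisor.dev/gvisor/pkg/syserror"
)

// nextStep classifies a pipe read result under the new convention.
func nextStep(err error) string {
	switch err {
	case nil:
		return "consume the bytes and keep reading"
	case io.EOF:
		return "all writers closed; end of stream"
	case syserror.ErrWouldBlock:
		return "pipe empty but writers remain; wait for readiness and retry"
	default:
		return "propagate the error"
	}
}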
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 2602bed72..d96bf253b 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -32,6 +33,8 @@ import (
 // VFSPipe represents the actual pipe, analogous to an inode. VFSPipes should
 // not be copied.
+//
+// +stateify savable
 type VFSPipe struct {
 	// mu protects the fields below.
 	mu sync.Mutex `state:"nosave"`
@@ -61,11 +64,18 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
 //
 // Preconditions: statusFlags should not contain an open access mode.
 func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
-	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags)
+	// Connected pipes share the same locks.
+	locks := &vfs.FileLocks{}
+	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
+	return syserror.ESPIPE
 }
 
 // Open opens the pipe represented by vp.
-func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) {
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
 	vp.mu.Lock()
 	defer vp.mu.Unlock()
 
@@ -75,7 +85,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
 		return nil, syserror.EINVAL
 	}
 
-	fd := vp.newFD(mnt, vfsd, statusFlags)
+	fd := vp.newFD(mnt, vfsd, statusFlags, locks)
 
 	// Named pipes have special blocking semantics during open:
 	//
@@ -98,7 +108,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
 		// If this pipe is being opened as blocking and there's no
 		// writer, we have to wait for a writer to open the other end.
 		if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
-			fd.DecRef()
+			fd.DecRef(ctx)
 			return nil, syserror.EINTR
 		}
 
@@ -109,12 +119,12 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
 		// Non-blocking, write-only opens fail with ENXIO when the read
 		// side isn't open yet.
 		if statusFlags&linux.O_NONBLOCK != 0 {
-			fd.DecRef()
+			fd.DecRef(ctx)
 			return nil, syserror.ENXIO
 		}
 		// Wait for a reader to open the other end.
 		if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
-			fd.DecRef()
+			fd.DecRef(ctx)
 			return nil, syserror.EINTR
 		}
 	}
@@ -127,10 +137,11 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
 }
 
 // Preconditions: vp.mu must be held.
-func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription { +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription { fd := &VFSPipeFD{ pipe: &vp.pipe, } + fd.LockFD.Init(locks) fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, @@ -155,16 +166,19 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) * // VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements // non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to // other FileDescriptions for splice(2) and tee(2). +// +// +stateify savable type VFSPipeFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD pipe *Pipe } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *VFSPipeFD) Release() { +func (fd *VFSPipeFD) Release(context.Context) { var event waiter.EventMask if fd.vfsfd.IsReadable() { fd.pipe.rClose() @@ -195,6 +209,11 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { } } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ESPIPE +} + // EventRegister implements waiter.Waitable.EventRegister. func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.pipe.EventRegister(e, mask) @@ -222,8 +241,7 @@ func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // PipeSize implements fcntl(F_GETPIPE_SZ). func (fd *VFSPipeFD) PipeSize() int64 { - // Inline Pipe.FifoSize() rather than calling it with nil Context and - // fs.File and ignoring the returned error (which is always nil). + // Inline Pipe.FifoSize() since we don't have a fs.File. fd.pipe.mu.Lock() defer fd.pipe.mu.Unlock() return fd.pipe.max @@ -234,19 +252,57 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { return fd.pipe.SetFifoSize(size) } -// IOSequence returns a useremm.IOSequence that reads up to count bytes from, -// or writes up to count bytes to, fd. -func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence { - return usermem.IOSequence{ +// SpliceToNonPipe performs a splice operation from fd to a non-pipe file. +func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) { + fd.pipe.mu.Lock() + defer fd.pipe.mu.Unlock() + + // Cap the sequence at number of bytes actually available. + v := fd.pipe.queuedLocked() + if v < count { + count = v + } + src := usermem.IOSequence{ IO: fd, Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), } + + var ( + n int64 + err error + ) + if off == -1 { + n, err = out.Write(ctx, src, vfs.WriteOptions{}) + } else { + n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{}) + } + if n > 0 { + fd.pipe.view.TrimFront(n) + } + return n, err } -// CopyIn implements usermem.IO.CopyIn. +// SpliceFromNonPipe performs a splice operation from a non-pipe file to fd. 
+func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) { + fd.pipe.mu.Lock() + defer fd.pipe.mu.Unlock() + + dst := usermem.IOSequence{ + IO: fd, + Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + } + + if off == -1 { + return in.Read(ctx, dst, vfs.ReadOptions{}) + } + return in.PRead(ctx, dst, off, vfs.ReadOptions{}) +} + +// CopyIn implements usermem.IO.CopyIn. Note that it is the caller's +// responsibility to trim fd.pipe.view after the read is completed. func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { origCount := int64(len(dst)) - n, err := fd.pipe.read(ctx, readOps{ + n, err := fd.pipe.readLocked(ctx, readOps{ left: func() int64 { return int64(len(dst)) }, @@ -255,7 +311,6 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, }, read: func(view *buffer.View) (int64, error) { n, err := view.ReadAt(dst, 0) - view.TrimFront(int64(n)) return int64(n), err }, }) @@ -271,7 +326,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, // CopyOut implements usermem.IO.CopyOut. func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { origCount := int64(len(src)) - n, err := fd.pipe.write(ctx, writeOps{ + n, err := fd.pipe.writeLocked(ctx, writeOps{ left: func() int64 { return int64(len(src)) }, @@ -295,7 +350,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, // ZeroOut implements usermem.IO.ZeroOut. func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { origCount := toZero - n, err := fd.pipe.write(ctx, writeOps{ + n, err := fd.pipe.writeLocked(ctx, writeOps{ left: func() int64 { return toZero }, @@ -316,14 +371,15 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6 return n, err } -// CopyInTo implements usermem.IO.CopyInTo. +// CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's +// responsibility to trim fd.pipe.view after the read is completed. func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { count := ars.NumBytes() if count == 0 { return 0, nil } origCount := count - n, err := fd.pipe.read(ctx, readOps{ + n, err := fd.pipe.readLocked(ctx, readOps{ left: func() int64 { return count }, @@ -332,7 +388,6 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst }, read: func(view *buffer.View) (int64, error) { n, err := view.ReadToSafememWriter(dst, uint64(count)) - view.TrimFront(int64(n)) return int64(n), err }, }) @@ -352,7 +407,7 @@ func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, return 0, nil } origCount := count - n, err := fd.pipe.write(ctx, writeOps{ + n, err := fd.pipe.writeLocked(ctx, writeOps{ left: func() int64 { return count }, @@ -446,3 +501,13 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr } return n, err } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index 5bc6aa931..ef4b70ca3 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Writer struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (w *Writer) Release() { +func (w *Writer) Release(context.Context) { w.Pipe.wClose() // Wake up readers. diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index e23e796ef..1145faf13 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" @@ -224,8 +225,9 @@ func (s *ptraceStop) Killable() bool { // beginPtraceStopLocked does not signal t's tracer or wake it if it is // waiting. // -// Preconditions: The TaskSet mutex must be locked. The caller must be running -// on the task goroutine. +// Preconditions: +// * The TaskSet mutex must be locked. +// * The caller must be running on the task goroutine. func (t *Task) beginPtraceStopLocked() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() @@ -270,8 +272,9 @@ func (t *Task) ptraceTrapLocked(code int32) { // ptraceStop, temporarily preventing it from being removed by a concurrent // Task.Kill, and returns true. Otherwise it returns false. // -// Preconditions: The TaskSet mutex must be locked. The caller must be running -// on the task goroutine of t's tracer. +// Preconditions: +// * The TaskSet mutex must be locked. +// * The caller must be running on the task goroutine of t's tracer. func (t *Task) ptraceFreeze() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() @@ -301,8 +304,9 @@ func (t *Task) ptraceUnfreeze() { t.ptraceUnfreezeLocked() } -// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be -// locked. +// Preconditions: +// * t must be in a frozen ptraceStop. +// * t's signal mutex must be locked. func (t *Task) ptraceUnfreezeLocked() { // Do this even if the task has been killed to ensure a panic if t.stop is // nil or not a ptraceStop. @@ -497,8 +501,9 @@ func (t *Task) forgetTracerLocked() { // ptraceSignalLocked is called after signal dequeueing to check if t should // enter ptrace signal-delivery-stop. // -// Preconditions: The signal mutex must be locked. The caller must be running -// on the task goroutine. +// Preconditions: +// * The signal mutex must be locked. +// * The caller must be running on the task goroutine. func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false @@ -828,8 +833,9 @@ func (t *Task) ptraceInterrupt(target *Task) error { return nil } -// Preconditions: The TaskSet mutex must be locked for writing. t must have a -// tracer. +// Preconditions: +// * The TaskSet mutex must be locked for writing. +// * t must have a tracer. 
func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { const valid = uintptr(linux.PTRACE_O_EXITKILL | linux.PTRACE_O_TRACESYSGOOD | @@ -994,18 +1000,15 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { // at the address specified by the data parameter, and the return value // is the error flag." - ptrace(2) word := t.Arch().Native(0) - if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ - IgnorePermissions: true, - }); err != nil { + if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil { return err } - _, err := t.CopyOut(data, word) + _, err := word.CopyOut(t, data) return err case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: - _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ - IgnorePermissions: true, - }) + word := t.Arch().Native(uintptr(data)) + _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr) return err case linux.PTRACE_GETREGSET: @@ -1018,6 +1021,9 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + ar := ars.Head() n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, @@ -1044,10 +1050,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + + mm := t.MemoryManager() + t.p.PullFullState(mm.AddressSpace(), t.Arch()) + ar := ars.Head() n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, - IO: t.MemoryManager(), + IO: mm, Addr: ar.Start, Opts: usermem.IOOpts{ AddressSpaceActive: true, @@ -1056,6 +1066,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + t.p.FullStateChanged() ar.End -= usermem.Addr(n) return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) @@ -1065,12 +1076,12 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if target.ptraceSiginfo == nil { return syserror.EINVAL } - _, err := t.CopyOut(data, target.ptraceSiginfo) + _, err := target.ptraceSiginfo.CopyOut(t, data) return err case linux.PTRACE_SETSIGINFO: var info arch.SignalInfo - if _, err := t.CopyIn(data, &info); err != nil { + if _, err := info.CopyIn(t, data); err != nil { return err } t.tg.pidns.owner.mu.RLock() @@ -1085,7 +1096,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if addr != linux.SignalSetSize { return syserror.EINVAL } - _, err := t.CopyOut(data, target.SignalMask()) + mask := target.SignalMask() + _, err := mask.CopyOut(t, data) return err case linux.PTRACE_SETSIGMASK: @@ -1093,7 +1105,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { return syserror.EINVAL } var mask linux.SignalSet - if _, err := t.CopyIn(data, &mask); err != nil { + if _, err := mask.CopyIn(t, data); err != nil { return err } // The target's task goroutine is stopped, so this is safe: @@ -1108,7 +1120,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() - _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg) return err // PEEKSIGINFO is unimplemented but seems to 
have no users anywhere. diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index cef1276ec..609ad3941 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -30,7 +30,7 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro if err != nil { return err } - _, err = t.CopyOut(data, n) + _, err = n.CopyOut(t, data) return err case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 18416643b..2a9023fdf 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -173,8 +173,10 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr { // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with // t's CPU number. // -// Preconditions: t.RSeqAvailable() == true. The caller must be running on the -// task goroutine. t's AddressSpace must be active. +// Preconditions: +// * t.RSeqAvailable() == true. +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { t.oldRSeqCPUAddr = addr @@ -189,8 +191,9 @@ func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { return nil } -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) rseqUpdateCPU() error { if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 { t.rseqCPU = -1 @@ -209,8 +212,9 @@ func (t *Task) rseqUpdateCPU() error { return oerr } -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) oldRSeqCopyOutCPU() error { if t.oldRSeqCPUAddr == 0 { return nil @@ -222,8 +226,9 @@ func (t *Task) oldRSeqCopyOutCPU() error { return err } -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) rseqCopyOutCPU() error { if t.rseqAddr == 0 { return nil @@ -240,8 +245,9 @@ func (t *Task) rseqCopyOutCPU() error { return err } -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) rseqClearCPU() error { buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. @@ -269,8 +275,9 @@ func (t *Task) rseqClearCPU() error { // // See kernel/rseq.c:rseq_ip_fixup for reference. // -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. 
func (t *Task) rseqAddrInterrupt() { if t.rseqAddr == 0 { return diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index c38c5a40c..387edfa91 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -18,7 +18,6 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" @@ -27,25 +26,18 @@ import ( const maxSyscallFilterInstructions = 1 << 15 -// seccompData is equivalent to struct seccomp_data, which contains the data -// passed to seccomp-bpf filters. -type seccompData struct { - // nr is the system call number. - nr int32 - - // arch is an AUDIT_ARCH_* value indicating the system call convention. - arch uint32 - - // instructionPointer is the value of the instruction pointer at the time - // of the system call. - instructionPointer uint64 - - // args contains the first 6 system call arguments. - args [6]uint64 -} - -func (d *seccompData) asBPFInput() bpf.Input { - return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder} +// dataAsBPFInput returns d serialized as BPF input; the result is only valid +// on the current task goroutine. +// +// Note: this is called for every syscall, which is a very hot path. +func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { + buf := t.CopyScratchBuffer(d.SizeBytes()) + d.MarshalUnsafe(buf) + return bpf.InputBytes{ + Data: buf, + // Go-marshal always uses the native byte order. + Order: usermem.ByteOrder, + } } func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { @@ -112,20 +104,20 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u } func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { - data := seccompData{ - nr: sysno, - arch: t.tc.st.AuditNumber, - instructionPointer: uint64(ip), + data := linux.SeccompData{ + Nr: sysno, + Arch: t.tc.st.AuditNumber, + InstructionPointer: uint64(ip), } // data.Args is [6]uint64 and args is []arch.SyscallArgument (uintptr), so // we can't do any slicing tricks or even use copy/append here. for i, arg := range args { - if i >= len(data.args) { + if i >= len(data.Args) { break } - data.args[i] = arg.Uint64() + data.Args[i] = arg.Uint64() } - input := data.asBPFInput() + input := dataAsBPFInput(t, &data) ret := uint32(linux.SECCOMP_RET_ALLOW) f := t.syscallFilters.Load() diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index c00fa1138..c39ecfb8f 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -283,6 +283,33 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File return nil } +// GetStat extracts semid_ds information from the set. +func (s *Set) GetStat(creds *auth.Credentials) (*linux.SemidDS, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set."
+ if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return nil, syserror.EACCES + } + + ds := &linux.SemidDS{ + SemPerm: linux.IPCPerm{ + Key: uint32(s.key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), + Mode: uint16(s.perms.LinuxMode()), + Seq: 0, // IPC sequence not supported. + }, + SemOTime: s.opTime.TimeT(), + SemCTime: s.changeTime.TimeT(), + SemNSems: uint64(s.Size()), + } + return ds, nil +} + // SetVal overrides a semaphore value, waking up waiters as needed. func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { if val < 0 || val > valueMax { @@ -320,7 +347,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti } for _, val := range vals { - if val < 0 || val > valueMax { + if val > valueMax { return syserror.ERANGE } } diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 0e19286de..df5c8421b 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,7 +16,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,7 +30,7 @@ type ProcessGroupID ThreadID // // +stateify savable type Session struct { - refs refs.AtomicRefCount + SessionRefs // leader is the originator of the Session. // @@ -61,16 +60,11 @@ type Session struct { sessionEntry } -// incRef grabs a reference. -func (s *Session) incRef() { - s.refs.IncRef() -} - -// decRef drops a reference. +// DecRef drops a reference. // // Precondition: callers must hold TaskSet.mu for writing. -func (s *Session) decRef() { - s.refs.DecRefWithDestructor(func() { +func (s *Session) DecRef() { + s.SessionRefs.DecRef(func() { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] @@ -87,7 +81,7 @@ func (s *Session) decRef() { // // +stateify savable type ProcessGroup struct { - refs refs.AtomicRefCount // not exported. + refs ProcessGroupRefs // originator is the originator of the group. // @@ -162,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { } alive := true - pg.refs.DecRefWithDestructor(func() { + pg.refs.DecRef(func() { alive = false // don't bother with handleOrphan. // Remove translations from the originator. @@ -174,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { // Remove the list of process groups. pg.session.processGroups.Remove(pg) - pg.session.decRef() + pg.session.DecRef() }) if alive { pg.handleOrphan() @@ -301,7 +295,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } - s.refs.EnableLeakCheck("kernel.Session") + s.EnableLeakCheck() // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -315,7 +309,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() // Tie them and return the result. s.processGroups.PushBack(pg) @@ -395,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // // We manually adjust the ancestors if the parent is in the same // session. 
- tg.processGroup.session.incRef() + tg.processGroup.session.IncRef() pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index bfd779837..80a592c8f 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,12 +1,25 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "shm_refs", + out = "shm_refs.go", + package = "shm", + prefix = "Shm", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "Shm", + }, +) + go_library( name = "shm", srcs = [ "device.go", "shm.go", + "shm_refs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -14,13 +27,13 @@ go_library( "//pkg/context", "//pkg/log", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index f66cfcc7f..ebbebf46b 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -39,13 +39,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -253,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } - shm.EnableLeakCheck("kernel.Shm") + shm.EnableLeakCheck() // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { @@ -323,9 +321,32 @@ func (r *Registry) remove(s *Shm) { r.totalPages -= s.effectiveSize / usermem.PageSize } +// Release drops the self-reference of each active shm segment in the registry. +// It is called when the kernel.IPCNamespace containing r is being destroyed. +func (r *Registry) Release(ctx context.Context) { + // Because Shm.DecRef() may acquire the same locks, collect the segments to + // release first. Note that this should not race with any updates to r, since + // the IPC namespace containing it has no more references. + toRelease := make([]*Shm, 0) + r.mu.Lock() + for _, s := range r.keysToShms { + s.mu.Lock() + if !s.pendingDestruction { + toRelease = append(toRelease, s) + } + s.mu.Unlock() + } + r.mu.Unlock() + + for _, s := range toRelease { + r.dissociateKey(s) + s.DecRef(ctx) + } +} + // Shm represents a single shared memory segment. // -// Shm segment are backed directly by an allocation from platform memory. +// Shm segments are backed directly by an allocation from platform memory. // Segments are always mapped as a whole, greatly simplifying how mappings are // tracked. 
However note that mremap and munmap calls may cause the vma for a // segment to become fragmented, which requires special care when unmapping a @@ -338,14 +359,14 @@ func (r *Registry) remove(s *Shm) { // // +stateify savable type Shm struct { - // AtomicRefCount tracks the number of references to this segment. + // ShmRefs tracks the number of references to this segment. // // A segment holds a reference to itself until it is marked for // destruction. // // In addition to direct users, the MemoryManager will hold references // via MappingIdentity. - refs.AtomicRefCount + ShmRefs mfp pgalloc.MemoryFileProvider @@ -370,7 +391,7 @@ type Shm struct { // fr is the offset into mfp.MemoryFile() that backs the contents of this // segment. Immutable. - fr platform.FileRange + fr memmap.FileRange // mu protects all fields below. mu sync.Mutex `state:"nosave"` @@ -429,11 +450,14 @@ func (s *Shm) InodeID() uint64 { return uint64(s.ID) } -// DecRef overrides refs.RefCount.DecRef with a destructor. +// DecRef drops a reference on s. // // Precondition: Caller must not hold s.mu. -func (s *Shm) DecRef() { - s.DecRefWithDestructor(s.destroy) +func (s *Shm) DecRef(ctx context.Context) { + s.ShmRefs.DecRef(func() { + s.mfp.MemoryFile().DecRef(s.fr) + s.registry.remove(s) + }) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm @@ -643,30 +667,28 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { return nil } -func (s *Shm) destroy() { - s.mfp.MemoryFile().DecRef(s.fr) - s.registry.remove(s) -} - // MarkDestroyed marks a segment for destruction. The segment is actually // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See // shmctl(IPC_RMID). -func (s *Shm) MarkDestroyed() { +func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() - defer s.mu.Unlock() - if !s.pendingDestruction { - s.pendingDestruction = true - // Drop the self-reference so destruction occurs when all - // external references are gone. - // - // N.B. This cannot be the final DecRef, as the caller also - // holds a reference. - s.DecRef() + if s.pendingDestruction { + s.mu.Unlock() return } + s.pendingDestruction = true + s.mu.Unlock() + + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. + s.DecRef(ctx) + return } // checkOwnership verifies whether a segment may be accessed by ctx as an diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 3eb78e91b..76d472292 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -8,7 +8,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 8243bb93e..78f718cfe 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -17,7 +17,6 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" @@ -76,7 +75,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { } // Release implements fs.FileOperations.Release.
-func (s *SignalOperations) Release() {} +func (s *SignalOperations) Release(context.Context) {} // Mask returns the signal mask. func (s *SignalOperations) Mask() linux.SignalSet { @@ -103,8 +102,7 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } // Copy out the signal info using the specified format. - var buf [128]byte - binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + infoNative := linux.SignalfdSiginfo{ Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, @@ -113,9 +111,13 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), - }) - n, err := dst.CopyOut(ctx, buf[:]) - return int64(n), err + } + n, err := infoNative.WriteTo(dst.Writer(ctx)) + if err == usermem.ErrEndOfIOSequence { + // Partial copy-out ok. + err = nil + } + return n, err } // Readiness implements waiter.Waitable.Readiness. diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 413111faf..332bdb8e8 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -348,6 +348,16 @@ func (s *SyscallTable) LookupName(sysno uintptr) string { return fmt.Sprintf("sys_%d", sysno) // Unlikely. } +// LookupNo looks up a syscall number by name. +func (s *SyscallTable) LookupNo(name string) (uintptr, error) { + for i, syscall := range s.Table { + if syscall.Name == name { + return uintptr(i), nil + } + } + return 0, fmt.Errorf("syscall %q not found", name) +} + // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 4607cde2f..a83ce219c 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -98,6 +98,15 @@ func (s *syslog) Log() []byte { s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...) } + if VFS2Enabled { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up VFS2..."))...) + if FUSEEnabled { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up FUSE..."))...) + } + } + time += rand.Float64() / 2 s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index f48247c94..037971393 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -68,6 +68,21 @@ type Task struct { // runState is exclusive to the task goroutine. runState taskRunState + // taskWorkCount represents the current size of the task work queue. It is + // used to avoid acquiring taskWorkMu when the queue is empty. + // + // Must be accessed with atomic memory operations. + taskWorkCount int32 + + // taskWorkMu protects taskWork. + taskWorkMu sync.Mutex `state:"nosave"` + + // taskWork is a queue of work to be executed before resuming user execution. + // It is similar to the task_work mechanism in Linux. + // + // taskWork is exclusive to the task goroutine. + taskWork []TaskWorker + // haveSyscallReturn is true if tc.Arch().Return() represents a value // returned by a syscall (or set by ptrace after a syscall). // @@ -550,11 +565,20 @@ type Task struct { // futexWaiter is exclusive to the task goroutine. futexWaiter *futex.Waiter `state:"nosave"` + + // robustList is a pointer to the head of the task's robust futex + // list.
+ robustList usermem.Addr + // startTime is the real time at which the task started. It is set when // a Task is created or invokes execve(2). // // startTime is protected by mu. startTime ktime.Time + + // kcov is the kcov instance providing code coverage owned by this task. + // + // kcov is exclusive to the task goroutine. + kcov *Kcov } func (t *Task) savePtraceTracer() *Task { @@ -632,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} { case CtxUTSNamespace: return t.utsns case CtxIPCNamespace: - return t.ipcns + ipcns := t.IPCNamespace() + ipcns.IncRef() + return ipcns case CtxTask: return t case auth.CtxCredentials: @@ -711,17 +737,16 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { func (t *Task) IsChrooted() bool { if VFS2Enabled { realRoot := t.mountNamespaceVFS2.Root() - defer realRoot.DecRef() root := t.fsContext.RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) return root != realRoot } realRoot := t.tg.mounts.Root() - defer realRoot.DecRef() + defer realRoot.DecRef(t) root := t.fsContext.RootDirectory() if root != nil { - defer root.DecRef() + defer root.DecRef(t) } return root != realRoot } @@ -844,7 +869,6 @@ func (t *Task) MountNamespace() *fs.MountNamespace { func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace { t.mu.Lock() defer t.mu.Unlock() - t.mountNamespaceVFS2.IncRef() return t.mountNamespaceVFS2 } @@ -884,3 +908,16 @@ func (t *Task) UID() uint32 { func (t *Task) GID() uint32 { return uint32(t.Credentials().EffectiveKGID) } + +// SetKcov sets the kcov instance associated with t. +func (t *Task) SetKcov(k *Kcov) { + t.kcov = k +} + +// ResetKcov clears the kcov instance associated with t. +func (t *Task) ResetKcov() { + if t.kcov != nil { + t.kcov.OnTaskExit() + t.kcov = nil + } +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index e1ecca99e..682080c14 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -161,6 +162,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { return 0, nil, syserror.EINVAL } + // Pull task registers and FPU state, a cloned task will inherit the + // state of the current task. + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a // single clone(2) or unshare(2) call, the user namespace is guaranteed to // be created first, giving the child (clone(2)) or caller (unshare(2)) @@ -199,7 +204,13 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" ipcns = NewIPCNamespace(userns) + } else { + ipcns.IncRef() } + cu := cleanup.Make(func() { + ipcns.DecRef(t) + }) + defer cu.Clean() netns := t.NetworkNamespace() if opts.NewNetworkNamespace { @@ -210,12 +221,18 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { mntnsVFS2 := t.mountNamespaceVFS2 if mntnsVFS2 != nil { mntnsVFS2.IncRef() + cu.Add(func() { + mntnsVFS2.DecRef(t) + }) } tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err } + cu.Add(func() { + tc.release() + }) // clone() returns 0 in the child. 
tc.Arch.SetReturn(0) if opts.Stack != 0 { @@ -237,7 +254,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { var fdTable *FDTable if opts.NewFiles { - fdTable = t.fdTable.Fork() + fdTable = t.fdTable.Fork(t) } else { fdTable = t.fdTable fdTable.IncRef() @@ -291,11 +308,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } else { cfg.InheritParent = t } - nt, err := t.tg.pidns.owner.NewTask(cfg) + nt, err := t.tg.pidns.owner.NewTask(t, cfg) + // If NewTask succeeds, we transfer references to nt. If NewTask fails, it does + // the cleanup for us. + cu.Release() if err != nil { - if opts.NewThreadGroup { - tg.release() - } return 0, nil, err } @@ -337,12 +354,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { nt.SetClearTID(opts.ChildTID) } if opts.ChildSetTID { - // Can't use Task.CopyOut, which assumes AddressSpaceActive. - usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + ctid := nt.ThreadID() + ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) } ntid := t.tg.pidns.IDOfTask(nt) if opts.ParentSetTID { - t.CopyOut(opts.ParentTID, ntid) + ntid.CopyOut(t, opts.ParentTID) } kind := ptraceCloneKindClone @@ -505,12 +522,13 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" + t.ipcns.DecRef(t) t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldFDTable *FDTable if opts.NewFiles { oldFDTable = t.fdTable - t.fdTable = oldFDTable.Fork() + t.fdTable = oldFDTable.Fork(t) } var oldFSContext *FSContext if opts.NewFSContext { @@ -519,10 +537,10 @@ func (t *Task) Unshare(opts *SharingOptions) error { } t.mu.Unlock() if oldFDTable != nil { - oldFDTable.DecRef() + oldFDTable.DecRef(t) } if oldFSContext != nil { - oldFSContext.DecRef() + oldFSContext.DecRef(t) } return nil } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 9fa528384..d1136461a 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -126,7 +126,11 @@ func (t *Task) SyscallTable() *SyscallTable { // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Stack() *arch.Stack { - return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} + return &arch.Stack{ + Arch: t.Arch(), + IO: t.MemoryManager(), + Bottom: usermem.Addr(t.Arch().Stack()), + } } // LoadTaskImage loads a specified file into a new TaskContext. diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 00c425cca..412d471d3 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -198,11 +198,18 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) t.tg.pidns.owner.mu.Unlock() + oldFDTable := t.fdTable + t.fdTable = t.fdTable.Fork(t) + oldFDTable.DecRef(t) + // Remove FDs with the CloseOnExec flag set. - t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { + t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec }) + // Handle the robust futex list. + t.exitRobustList() + // NOTE(b/30815691): We currently do not implement privileged // executables (set-user/group-ID bits and file capabilities). 
This // allows us to unconditionally enable user dumpability on the new mm. @@ -219,6 +226,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tc = *r.tc t.mu.Unlock() t.unstopVforkParent() + t.p.FullStateChanged() // NOTE(b/30316266): All locks must be dropped prior to calling Activate. t.MemoryManager().Activate(t) @@ -229,9 +237,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // promoteLocked makes t the leader of its thread group. If t is already the // thread group leader, promoteLocked is a no-op. // -// Preconditions: All other tasks in t's thread group, including the existing -// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must -// be locked for writing. +// Preconditions: +// * All other tasks in t's thread group, including the existing leader (if it +// is not t), have reached TaskExitZombie. +// * The TaskSet mutex must be locked for writing. func (t *Task) promoteLocked() { oldLeader := t.tg.leader if t == oldLeader { diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index c4ade6e8e..ce7b9641d 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -239,6 +239,8 @@ func (*runExitMain) execute(t *Task) taskRunState { t.traceExitEvent() lastExiter := t.exitThreadGroup() + t.ResetKcov() + // If the task has a cleartid, and the thread group wasn't killed by a // signal, handle that before releasing the MM. if t.cleartid != 0 { @@ -246,13 +248,17 @@ func (*runExitMain) execute(t *Task) taskRunState { signaled := t.tg.exiting && t.tg.exitStatus.Signaled() t.tg.signalHandlers.mu.Unlock() if !signaled { - if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { + zero := ThreadID(0) + if _, err := zero.CopyOut(t, t.cleartid); err == nil { t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1) } // If the CopyOut fails, there's nothing we can do. } } + // Handle the robust futex list. + t.exitRobustList() + // Deactivate the address space and update max RSS before releasing the // task's MM. t.Deactivate() @@ -266,20 +272,21 @@ func (*runExitMain) execute(t *Task) taskRunState { // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() - t.fsContext.DecRef() - t.fdTable.DecRef() + t.fsContext.DecRef(t) + t.fdTable.DecRef(t) t.mu.Lock() if t.mountNamespaceVFS2 != nil { - t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2.DecRef(t) t.mountNamespaceVFS2 = nil } + t.ipcns.DecRef(t) t.mu.Unlock() // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { - t.tg.release() + t.tg.Release(t) } // Detach tracees. diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index a53e77c9f..c80391475 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -15,6 +15,8 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" ) @@ -52,3 +54,127 @@ func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) } + +// GetRobustList sets the robust futex list for the task. +func (t *Task) GetRobustList() usermem.Addr { + t.mu.Lock() + addr := t.robustList + t.mu.Unlock() + return addr +} + +// SetRobustList sets the robust futex list for the task. 
+func (t *Task) SetRobustList(addr usermem.Addr) { + t.mu.Lock() + t.robustList = addr + t.mu.Unlock() +} + +// exitRobustList walks the robust futex list, marking locks dead and waking +// waiters. It corresponds to Linux's exit_robust_list(). Following Linux, +// errors are silently ignored. +func (t *Task) exitRobustList() { + t.mu.Lock() + addr := t.robustList + t.robustList = 0 + t.mu.Unlock() + + if addr == 0 { + return + } + + var rl linux.RobustListHead + if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil { + return + } + + next := primitive.Uint64(rl.List) + done := 0 + var pendingLockAddr usermem.Addr + if rl.ListOpPending != 0 { + pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset) + } + + // Wake up normal elements. + for usermem.Addr(next) != addr { + // We traverse to the next element of the list before we + // actually wake anything. This prevents the race where waking + // this futex causes a modification of the list. + thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset) + + // Try to decode the next element in the list before waking the + // current futex. But don't check the error until after we've + // woken the current futex. Linux does it in this order too. + _, nextErr := next.CopyIn(t, usermem.Addr(next)) + + // Wake up the current futex if it's not pending. + if thisLockAddr != pendingLockAddr { + t.wakeRobustListOne(thisLockAddr) + } + + // If there was an error copying the next futex, we must bail. + if nextErr != nil { + break + } + + // This is a user structure, so it could be a massive list, or + // even contain a loop if they are trying to mess with us. We + // cap traversal to prevent that. + done++ + if done >= linux.ROBUST_LIST_LIMIT { + break + } + } + + // Is there a pending entry to wake? + if pendingLockAddr != 0 { + t.wakeRobustListOne(pendingLockAddr) + } +} + +// wakeRobustListOne wakes a single futex from the robust list. +func (t *Task) wakeRobustListOne(addr usermem.Addr) { + // Bit 0 in address signals PI futex. + pi := addr&1 == 1 + addr = addr &^ 1 + + // Load the futex. + f, err := t.LoadUint32(addr) + if err != nil { + // Can't read this single value? Ignore the problem. + // We can wake the other futexes in the list. + return + } + + tid := uint32(t.ThreadID()) + for { + // Is this held by someone else? + if f&linux.FUTEX_TID_MASK != tid { + return + } + + // This thread is dying and it's holding this futex. We need to + // set the owner died bit and wake up any waiters. + newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED + if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil { + return + } else if curF != f { + // Futex changed out from under us. Try again... + f = curF + continue + } + + // Wake waiters if there are any. + if f&linux.FUTEX_WAITERS != 0 { + private := f&linux.FUTEX_PRIVATE_FLAG != 0 + if pi { + t.Futex().UnlockPI(t, addr, tid, private) + return + } + t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1) + } + + // Done. + return + } +} diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index eeccaa197..d23cea802 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -27,6 +27,9 @@ const ( // maxStackDebugBytes is the maximum number of user stack bytes that may be // printed by debugDumpStack. maxStackDebugBytes = 1024 + // maxCodeDebugBytes is the maximum number of user code bytes that may be + // printed by debugDumpCode. + maxCodeDebugBytes = 128 ) // Infof logs a formatted info message by calling log.Infof.
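The wakeRobustListOne loop above reduces to a single word transformation on the futex: clear the owner TID, set FUTEX_OWNER_DIED, and preserve FUTEX_WAITERS so that any waiters are still woken. Below is a standalone sketch of just that step, assuming the futex constants behave as defined in pkg/abi/linux; markOwnerDied is a hypothetical helper, not part of the change.

package example

import "gvisor.dev/gvisor/pkg/abi/linux"

// markOwnerDied computes the value a robust futex word is CAS'd to when the
// thread tid dies while holding it. ok is false if tid does not own the
// futex, in which case the word must be left alone.
func markOwnerDied(f, tid uint32) (newF uint32, ok bool) {
	if f&linux.FUTEX_TID_MASK != tid {
		// Held by someone else (or unlocked); nothing to mark.
		return f, false
	}
	// Keep the waiters bit, set the owner-died bit, drop the TID.
	return (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED, true
}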
@@ -61,6 +64,7 @@ func (t *Task) IsLogging(level log.Level) bool { func (t *Task) DebugDumpState() { t.debugDumpRegisters() t.debugDumpStack() + t.debugDumpCode() if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } @@ -128,6 +132,45 @@ func (t *Task) debugDumpStack() { } } +// debugDumpCode logs user code contents at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpCode() { + if !t.IsLogging(log.Debug) { + return + } + m := t.MemoryManager() + if m == nil { + t.Debugf("Memory manager for task is gone, skipping application code dump.") + return + } + t.Debugf("Code:") + // Print code on both sides of the instruction register. + start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 + // Round addr down to a 16-byte boundary. + start &= ^usermem.Addr(15) + // Print 16 bytes per line, one byte at a time. + for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 { + addr, ok := start.AddLength(offset) + if !ok { + break + } + var data [16]byte + n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ + IgnorePermissions: true, + }) + // Print as much of the line as we can, even if an error was + // encountered. + if n > 0 { + t.Debugf("%x: % x", addr, data[:n]) + } + if err != nil { + t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + break + } + } +} + // trace definitions. // // Note that all region names are prefixed by ':' in order to ensure that they @@ -203,6 +246,6 @@ func (t *Task) traceExecEvent(tc *TaskContext) { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return } - defer file.DecRef() + defer file.DecRef(t) trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index d654dd997..8dc3fec90 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -26,6 +26,7 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -140,7 +141,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error { region := trace.StartRegion(t.traceContext, cpuidRegion) expected := arch.CPUIDInstruction[:] found := make([]byte, len(expected)) - _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + _, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found) if err == nil && bytes.Equal(expected, found) { // Skip the cpuid instruction. t.Arch().CPUIDEmulate(t) @@ -167,15 +168,30 @@ func (app *runApp) execute(t *Task) taskRunState { return (*runInterrupt)(nil) } - // We're about to switch to the application again. If there's still a + // Execute any task work callbacks before returning to user space. + if atomic.LoadInt32(&t.taskWorkCount) > 0 { + t.taskWorkMu.Lock() + queue := t.taskWork + t.taskWork = nil + atomic.StoreInt32(&t.taskWorkCount, 0) + t.taskWorkMu.Unlock() + + // Do not hold taskWorkMu while executing task work, which may register + // more work. + for _, work := range queue { + work.TaskWork(t) + } + } + + // We're about to switch to the application again. If there's still an // unhandled SyscallRestartErrno that wasn't translated to an EINTR, // restart the syscall that was interrupted. If there's a saved signal // mask, restore it. 
(Note that restoring the saved signal mask may unblock // a pending signal, causing another interruption, but that signal should // not interact with the interrupted syscall.) if t.haveSyscallReturn { - if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { - if sre == ERESTART_RESTARTBLOCK { + if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre == syserror.ERESTART_RESTARTBLOCK { t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) t.Arch().RestartSyscallWithRestartBlock() } else { @@ -245,7 +261,7 @@ func (app *runApp) execute(t *Task) taskRunState { region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) - info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) + info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) region.End() diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 09366b60c..52c55d13d 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -133,9 +133,10 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { } } -// Preconditions: The caller must be running on the task goroutine, and leaving -// a state indicated by a previous call to -// t.accountTaskGoroutineEnter(state). +// Preconditions: +// * The caller must be running on the task goroutine +// * The caller must be leaving a state indicated by a previous call to +// t.accountTaskGoroutineEnter(state). func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { if state != TaskGoroutineRunningApp { // Task is unblocking/continuing. @@ -191,8 +192,8 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats { return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow()) } -// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex -// must be locked. +// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus: +// * The TaskSet mutex must be locked. func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats { stats := tg.exitedCPUStats // Account for live tasks. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 79766cafe..ebdb83061 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -159,7 +159,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS sigact := computeAction(linux.Signal(info.Signo), act) if t.haveSyscallReturn { - if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { // Signals that are ignored, cause a thread group stop, or // terminate the thread group do not interact with interrupted // syscalls; in Linux terms, they are never returned to the signal @@ -168,11 +168,11 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS // signal that is actually handled (by userspace). 
if sigact == SignalActionHandler { switch { - case sre == ERESTARTNOHAND: + case sre == syserror.ERESTARTNOHAND: fallthrough - case sre == ERESTART_RESTARTBLOCK: + case sre == syserror.ERESTART_RESTARTBLOCK: fallthrough - case (sre == ERESTARTSYS && !act.IsRestart()): + case (sre == syserror.ERESTARTSYS && !act.IsRestart()): t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1))) default: @@ -255,10 +255,15 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) } } + mm := t.MemoryManager() // Set up the signal handler. If we have a saved signal mask, the signal // handler should run with the current mask, but sigreturn should restore // the saved one. - st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + st := &arch.Stack{ + Arch: t.Arch(), + IO: mm, + Bottom: sp, + } mask := t.signalMask if t.haveSavedSignalMask { mask = t.savedSignalMask @@ -273,12 +278,13 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) // Please see the linux code as reference: // linux/arch/arm64/kernel/signal.c:setup_return() if act.Flags&linux.SA_RESTORER == 0 { - act.Restorer = t.MemoryManager().VDSOSigReturn() + act.Restorer = mm.VDSOSigReturn() } if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { return err } + t.p.FullStateChanged() t.haveSavedSignalMask = false // Add our signal mask. @@ -310,14 +316,16 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. t.SetSignalMask(sigset &^ UnblockableSignals) + t.p.FullStateChanged() return ctrlResume, nil } // Sigtimedwait implements the semantics of sigtimedwait(2). // -// Preconditions: The caller must be running on the task goroutine. t.exitState -// < TaskExitZombie. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t.exitState < TaskExitZombie. func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { // set is the set of signals we're interested in; invert it to get the set // of signals to block. @@ -581,8 +589,9 @@ func (t *Task) SignalMask() linux.SignalSet { // SetSignalMask sets t's signal mask. // -// Preconditions: SetSignalMask can only be called by the task goroutine. -// t.exitState < TaskExitZombie. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t.exitState < TaskExitZombie. func (t *Task) SetSignalMask(mask linux.SignalSet) { // By precondition, t prevents t.tg from completing an execve and mutating // t.tg.signalHandlers, so we can skip the TaskSet mutex. @@ -628,7 +637,7 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { // SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's // comment). // -// Preconditions: SetSavedSignalMask can only be called by the task goroutine. +// Preconditions: The caller must be running on the task goroutine. func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { t.savedSignalMask = mask t.haveSavedSignalMask = true @@ -636,6 +645,7 @@ func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { // SignalStack returns the task-private signal stack. 
func (t *Task) SignalStack() arch.SignalStack {
+	t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
	alt := t.signalStack
	if t.onSignalStack(alt) {
		alt.Flags |= arch.SignalStackFlagOnStack
@@ -1050,6 +1060,8 @@ func (*runInterrupt) execute(t *Task) taskRunState {
		// Are there signals pending?
		if info := t.dequeueSignalLocked(t.signalMask); info != nil {
+			t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
			if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 {
				// Indicate that we've dequeued a stop signal before unlocking the
				// signal mutex; initiateGroupStop will check for races with
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 8485fb4b6..8e28230cc 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -16,6 +16,7 @@ package kernel
import (
	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -98,14 +99,18 @@ type TaskConfig struct {
// NewTask creates a new task defined by cfg.
//
// NewTask does not start the returned task; the caller must call Task.Start.
-func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
+//
+// If successful, NewTask transfers references held by cfg to the new task.
+// Otherwise, NewTask releases them.
+func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	t, err := ts.newTask(cfg)
	if err != nil {
		cfg.TaskContext.release()
-		cfg.FSContext.DecRef()
-		cfg.FDTable.DecRef()
+		cfg.FSContext.DecRef(ctx)
+		cfg.FDTable.DecRef(ctx)
+		cfg.IPCNamespace.DecRef(ctx)
		if cfg.MountNamespaceVFS2 != nil {
-			cfg.MountNamespaceVFS2.DecRef()
+			cfg.MountNamespaceVFS2.DecRef(ctx)
		}
		return nil, err
	}
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
index 10c6e455c..a35948a5f 100644
--- a/pkg/sentry/kernel/task_stop.go
+++ b/pkg/sentry/kernel/task_stop.go
@@ -99,8 +99,9 @@ type TaskStop interface {
// beginInternalStop indicates the start of an internal stop that applies to t.
//
-// Preconditions: The task must not already be in an internal stop (i.e. t.stop
-// == nil). The caller must be running on the task goroutine.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * The task must not already be in an internal stop (i.e. t.stop == nil).
func (t *Task) beginInternalStop(s TaskStop) {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.beginInternalStopLocked(s)
}
-// Preconditions: The signal mutex must be locked. All preconditions for
-// Task.beginInternalStop also apply.
+// Preconditions: Same as beginInternalStop, plus:
+// * The signal mutex must be locked.
func (t *Task) beginInternalStopLocked(s TaskStop) {
	if t.stop != nil {
		panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop))
	}
@@ -128,8 +129,9 @@
// t.stop, which is why there is no endInternalStop that locks the signal mutex
// for you.
//
-// Preconditions: The signal mutex must be locked. The task must be in an
-// internal stop (i.e. t.stop != nil).
+// Preconditions:
+// * The signal mutex must be locked.
+// * The task must be in an internal stop (i.e. t.stop != nil).
func (t *Task) endInternalStopLocked() {
	if t.stop == nil {
		panic("Attempting to leave non-existent internal stop")
@@ -205,6 +207,22 @@ func (ts *TaskSet) BeginExternalStop() {
	}
}
+// PullFullState receives full states for all tasks.
+func (ts *TaskSet) PullFullState() {
+	ts.mu.Lock()
+	defer ts.mu.Unlock()
+	if ts.Root == nil {
+		return
+	}
+	for t := range ts.Root.tids {
+		t.Activate()
+		if mm := t.MemoryManager(); mm != nil {
+			t.p.PullFullState(mm.AddressSpace(), t.Arch())
+		}
+		t.Deactivate()
+	}
+}
+
// EndExternalStop indicates the end of an external stop started by a previous
// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task
// goroutines to resume.
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index a5903b0b5..0141459e7 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -22,6 +22,7 @@ import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bits"
+	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/metric"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -29,75 +30,8 @@ import (
	"gvisor.dev/gvisor/pkg/usermem"
)
-// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel
-// include/linux/errno.h. These errnos are never returned to userspace
-// directly, but are used to communicate the expected behavior of an
-// interrupted syscall from the syscall to signal handling.
-type SyscallRestartErrno int
-
-// These numeric values are significant because ptrace syscall exit tracing can
-// observe them.
-//
-// For all of the following errnos, if the syscall is not interrupted by a
-// signal delivered to a user handler, the syscall is restarted.
-const (
-	// ERESTARTSYS is returned by an interrupted syscall to indicate that it
-	// should be converted to EINTR if interrupted by a signal delivered to a
-	// user handler without SA_RESTART set, and restarted otherwise.
-	ERESTARTSYS = SyscallRestartErrno(512)
-
-	// ERESTARTNOINTR is returned by an interrupted syscall to indicate that it
-	// should always be restarted.
-	ERESTARTNOINTR = SyscallRestartErrno(513)
-
-	// ERESTARTNOHAND is returned by an interrupted syscall to indicate that it
-	// should be converted to EINTR if interrupted by a signal delivered to a
-	// user handler, and restarted otherwise.
-	ERESTARTNOHAND = SyscallRestartErrno(514)
-
-	// ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate
-	// that it should be restarted using a custom function. The interrupted
-	// syscall must register a custom restart function by calling
-	// Task.SetRestartSyscallFn.
-	ERESTART_RESTARTBLOCK = SyscallRestartErrno(516)
-)
-
var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application")
-// Error implements error.Error.
-func (e SyscallRestartErrno) Error() string {
-	// Descriptions are borrowed from strace.
-	switch e {
-	case ERESTARTSYS:
-		return "to be restarted if SA_RESTART is set"
-	case ERESTARTNOINTR:
-		return "to be restarted"
-	case ERESTARTNOHAND:
-		return "to be restarted if no handler"
-	case ERESTART_RESTARTBLOCK:
-		return "interrupted by signal"
-	default:
-		return "(unknown interrupt error)"
-	}
-}
-
-// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
-// rv, the value in a syscall return register.
-func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
-	switch int(rv) {
-	case -int(ERESTARTSYS):
-		return ERESTARTSYS, true
-	case -int(ERESTARTNOINTR):
-		return ERESTARTNOINTR, true
-	case -int(ERESTARTNOHAND):
-		return ERESTARTNOHAND, true
-	case -int(ERESTART_RESTARTBLOCK):
-		return ERESTART_RESTARTBLOCK, true
-	default:
-		return 0, false
-	}
-}
-
// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
@@ -354,7 +288,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
	// Grab the caller up front, to make sure there's a sensible stack.
	caller := t.Arch().Native(uintptr(0))
-	if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil {
+	if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil {
		t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
@@ -390,7 +324,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
type runVsyscallAfterPtraceEventSeccomp struct {
	addr   usermem.Addr
	sysno  uintptr
-	caller interface{}
+	caller marshal.Marshallable
}
func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
@@ -413,7 +347,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}
-func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
+func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
	rval, ctrl, err := t.executeSyscall(sysno, args)
	if ctrl != nil {
		t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
@@ -447,7 +381,7 @@ func ExtractErrno(err error, sysno int) int {
		return 0
	case syscall.Errno:
		return int(err)
-	case SyscallRestartErrno:
+	case syserror.SyscallRestartErrno:
		return int(err)
	case *memmap.BusError:
		// Bus errors may generate SIGBUS, but for syscalls they still
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index b02044ad2..ce134bf54 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,6 +18,7 @@ import (
	"math"
	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)
@@ -43,17 +44,6 @@ func (t *Task) Deactivate() {
	}
}
-// CopyIn copies a fixed-size value or slice of fixed-size values in from the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not readable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) {
-	return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-}
-
// CopyInBytes is a fast version of CopyIn if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -64,17 +54,6 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
	})
}
-// CopyOut copies a fixed-size value or slice of fixed-size values out to the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not writeable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) {
-	return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-}
-
// CopyOutBytes is a fast version of CopyOut if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -114,7 +93,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
	var v []string
	for {
		argAddr := t.Arch().Native(0)
-		if _, err := t.CopyIn(addr, argAddr); err != nil {
+		if _, err := argAddr.CopyIn(t, addr); err != nil {
			return v, err
		}
		if t.Arch().Value(argAddr) == 0 {
@@ -143,8 +122,9 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
// CopyOutIovecs converts src to an array of struct iovecs and copies it to the
// memory mapped at addr.
//
-// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyOut, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
	switch t.Arch().Width() {
	case 8:
@@ -191,8 +171,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
// combined length of all AddrRanges would otherwise exceed this amount, ranges
// beyond MAX_RW_COUNT are silently truncated.
//
-// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyIn, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
	if numIovecs == 0 {
		return usermem.AddrRangeSeq{}, nil
@@ -284,7 +265,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp
//
// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
//
-// Preconditions: As for Task.CopyInIovecs.
+// Preconditions: Same as Task.CopyInIovecs.
func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
	if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
		return usermem.IOSequence{}, syserror.EINVAL
@@ -299,3 +280,30 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
		Opts: opts,
	}, nil
}
+
+// copyContext implements marshal.CopyContext. It wraps a task to allow copying
+// memory to and from the task memory with custom usermem.IOOpts.
+type copyContext struct {
+	*Task
+	opts usermem.IOOpts
+}
+
+// AsCopyContext wraps the task and returns it as CopyContext.
+func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
+	return &copyContext{t, opts}
+}
+
+// CopyInString copies a string in from the task's memory.
+func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
+	return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+}
+
+// CopyInBytes copies task memory into dst from an IO context.
+func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+	return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+}
+
+// CopyOutBytes copies src into task memory from an IO context.
+func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+	return t.MemoryManager().CopyOut(t, addr, src, t.opts)
+}
diff --git a/pkg/sentry/kernel/task_work.go b/pkg/sentry/kernel/task_work.go
new file mode 100644
index 000000000..dda5a433a
--- /dev/null
+++ b/pkg/sentry/kernel/task_work.go
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "sync/atomic"
+
+// TaskWorker is a deferred task.
+//
+// This must be savable.
+type TaskWorker interface {
+	// TaskWork will be executed prior to returning to user space. Note that
+	// TaskWork may call RegisterWork again, but this will not be executed until
+	// the next return to user space, unlike in Linux. This effectively allows
+	// registration of indefinite user return hooks, but not by default.
+	TaskWork(t *Task)
+}
+
+// RegisterWork can be used to register additional task work that will be
+// performed prior to returning to user space. See TaskWorker.TaskWork for
+// semantics regarding registration.
+func (t *Task) RegisterWork(work TaskWorker) {
+	t.taskWorkMu.Lock()
+	defer t.taskWorkMu.Unlock()
+	atomic.AddInt32(&t.taskWorkCount, 1)
+	t.taskWork = append(t.taskWork, work)
+}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 52849f5b3..a183b28c1 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -18,6 +18,7 @@ import (
	"sync/atomic"
	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -307,8 +308,8 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet {
	return tg.limits
}
-// release releases the thread group's resources.
-func (tg *ThreadGroup) release() {
+// Release releases the thread group's resources.
+func (tg *ThreadGroup) Release(ctx context.Context) {
	// Timers must be destroyed without holding the TaskSet or signal mutexes
	// since timers send signals with Timer.mu locked.
	tg.itimerRealTimer.Destroy()
@@ -325,7 +326,7 @@
		it.DestroyTimer()
	}
	if tg.mounts != nil {
-		tg.mounts.DecRef()
+		tg.mounts.DecRef(ctx)
	}
}
@@ -366,7 +367,8 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error {
	// terminal is stolen, and all processes that had it as controlling
	// terminal lose it." - tty_ioctl(4)
	if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
-		if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 {
+		// Stealing requires CAP_SYS_ADMIN in the root user namespace.
+		if creds := auth.CredentialsFromContext(tg.leader); !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) || arg != 1 {
			return syserror.EPERM
		}
		// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
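[Editor's note] The task_work.go file added above is small but subtle: TaskWork runs on the task goroutine immediately before the next return to user space, and re-registering from inside TaskWork defers the work to the following return rather than re-running it in the same pass. A minimal sketch of a client in package kernel follows; the logOnReturn type is hypothetical, invented here purely for illustration:

// logOnReturn is a hypothetical TaskWorker that logs once immediately
// before the task next returns to user space.
type logOnReturn struct {
	msg string
}

// TaskWork implements TaskWorker.TaskWork. It runs on the task goroutine;
// calling t.RegisterWork(w) from here would defer w to the *next*
// user-space return, per the interface comment above.
func (w *logOnReturn) TaskWork(t *Task) {
	t.Debugf("task work: %s", w.msg)
}

A caller on the task goroutine would then queue it with t.RegisterWork(&logOnReturn{msg: "fd table changed"}).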
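[Editor's note] Likewise, the copyContext type added in task_usermem.go exists so that marshallable values can be copied through a task with non-default usermem.IOOpts, whereas the plain *Task methods shown above assert AddressSpaceActive: true. A hedged sketch of the intended call pattern; the helper function is invented for illustration and assumes the pkg/marshal CopyContext-based CopyOut shown elsewhere in this diff:

// copyOutInactive is a hypothetical helper: it copies a marshallable value
// out to task memory without asserting that the address space is active.
func copyOutInactive(t *Task, addr usermem.Addr, v marshal.Marshallable) (int, error) {
	cc := t.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false})
	return v.CopyOut(cc, addr)
}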
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index bf2dabb6e..fdadb52c0 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -36,6 +36,8 @@ import (
const TasksLimit = (1 << 16)
// ThreadID is a generic thread identifier.
+//
+// +marshal
type ThreadID int32
// String returns a decimal representation of the ThreadID.
@@ -87,6 +89,13 @@ type TaskSet struct {
	// at time of save (but note that this is not necessarily the same thing as
	// sync.WaitGroup's zero value).
	runningGoroutines sync.WaitGroup `state:"nosave"`
+
+	// aioGoroutines is the number of goroutines running async I/O
+	// callbacks.
+	//
+	// aioGoroutines is not saved but is required to be zero at the time of
+	// save.
+	aioGoroutines sync.WaitGroup `state:"nosave"`
}
// newTaskSet returns a new, empty TaskSet.
@@ -256,6 +265,13 @@ func (ns *PIDNamespace) Tasks() []*Task {
	return tasks
}
+// NumTasks returns the number of tasks in ns.
+func (ns *PIDNamespace) NumTasks() int {
+	ns.owner.mu.RLock()
+	defer ns.owner.mu.RUnlock()
+	return len(ns.tids)
+}
+
// ThreadGroups returns a snapshot of the thread groups in ns.
func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
	return ns.ThreadGroupsAppend(nil)
}
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 7ba7dc50c..2817aa3ba 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -6,6 +6,7 @@ go_library(
    name = "time",
    srcs = [
        "context.go",
+        "tcpip.go",
        "time.go",
    ],
    visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/kernel/time/tcpip.go b/pkg/sentry/kernel/time/tcpip.go
new file mode 100644
index 000000000..c4474c0cf
--- /dev/null
+++ b/pkg/sentry/kernel/time/tcpip.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package time
+
+import (
+	"sync"
+	"time"
+)
+
+// TcpipAfterFunc waits for duration to elapse according to clock then runs fn.
+// The timer is started immediately and will fire exactly once.
+func TcpipAfterFunc(clock Clock, duration time.Duration, fn func()) *TcpipTimer {
+	timer := &TcpipTimer{
+		clock: clock,
+	}
+	timer.notifier = functionNotifier{
+		fn: func() {
+			// tcpip.Timer.Stop() explicitly states that the function is called in a
+			// separate goroutine that Stop() does not synchronize with.
+			// Timer.Destroy() synchronizes with calls to TimerListener.Notify().
+			// This is semantically meaningful because, in the former case, it's
+			// legal to call tcpip.Timer.Stop() while holding locks that may also be
+			// taken by the function, but this isn't so in the latter case. Most
+			// immediately, Timer calls TimerListener.Notify() while holding
+			// Timer.mu. A deadlock occurs without spawning a goroutine:
+			//	T1: (Timer expires)
+			//	=> Timer.Tick() <- Timer.mu.Lock() called
+			//	=> TimerListener.Notify()
+			//	=> Timer.Stop()
+			//	=> Timer.Destroy() <- Timer.mu.Lock() called, deadlock!
+			//
+			// Spawning a goroutine avoids the deadlock:
+			//	T1: (Timer expires)
+			//	=> Timer.Tick() <- Timer.mu.Lock() called
+			//	=> TimerListener.Notify() <- Launches T2
+			//	T2:
+			//	=> Timer.Stop()
+			//	=> Timer.Destroy() <- Timer.mu.Lock() called, blocks
+			//	T1:
+			//	=> (returns) <- Timer.mu.Unlock() called
+			//	T2:
+			//	=> (continues) <- No deadlock!
+			go func() {
+				timer.Stop()
+				fn()
+			}()
+		},
+	}
+	timer.Reset(duration)
+	return timer
+}
+
+// TcpipTimer is a resettable timer with variable duration expirations.
+// Implements tcpip.Timer, which does not define a Destroy method; instead, all
+// resources are released after timer expiration and calls to Timer.Stop.
+//
+// Must be created by TcpipAfterFunc.
+type TcpipTimer struct {
+	// clock is the time source. clock is immutable.
+	clock Clock
+
+	// notifier is called when the Timer expires. notifier is immutable.
+	notifier functionNotifier
+
+	// mu protects t.
+	mu sync.Mutex
+
+	// t stores the latest running Timer. This is replaced whenever Reset is
+	// called since Timer cannot be restarted once it has been Destroyed by Stop.
+	//
+	// This field is nil iff Stop has been called.
+	t *Timer
+}
+
+// Stop implements tcpip.Timer.Stop.
+func (r *TcpipTimer) Stop() bool {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.t == nil {
+		return false
+	}
+	_, lastSetting := r.t.Swap(Setting{})
+	r.t.Destroy()
+	r.t = nil
+	return lastSetting.Enabled
+}
+
+// Reset implements tcpip.Timer.Reset.
+func (r *TcpipTimer) Reset(d time.Duration) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.t == nil {
+		r.t = NewTimer(r.clock, &r.notifier)
+	}
+
+	r.t.Swap(Setting{
+		Enabled: true,
+		Period:  0,
+		Next:    r.clock.Now().Add(d),
+	})
+}
+
+// functionNotifier is a TimerListener that runs a function.
+//
+// functionNotifier cannot be saved or loaded.
+type functionNotifier struct {
+	fn func()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (f *functionNotifier) Notify(uint64, Setting) (Setting, bool) {
+	f.fn()
+	return Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (f *functionNotifier) Destroy() {}
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index e959700f2..f61a8e164 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -616,8 +616,10 @@ func (t *Timer) Swap(s Setting) (Time, Setting) {
// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true
// starts the timer, while setting s.Enabled to false stops it.
//
-// Preconditions: The Timer must not be paused. f cannot call any Timer methods
-// since it is called with the Timer mutex locked.
+// Preconditions:
+// * The Timer must not be paused.
+// * f cannot call any Timer methods since it is called with the Timer mutex
+//   locked.
func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
	now := t.clock.Now()
	t.mu.Lock()
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index da0ea7bb5..7c4fefb16 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -21,8 +21,8 @@ import (
	"gvisor.dev/gvisor/pkg/log"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
	sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
	"gvisor.dev/gvisor/pkg/sync"
)
@@ -90,7 +90,7 @@ type Timekeeper struct {
// NewTimekeeper does not take ownership of paramPage.
//
// SetClocks must be called on the returned Timekeeper before it is usable.
-func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) {
+func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) {
	return &Timekeeper{
		params: NewVDSOParamPage(mfp, paramPage),
	}, nil
@@ -186,6 +186,7 @@ func (t *Timekeeper) startUpdater() {
	timer := time.NewTicker(sentrytime.ApproxUpdateInterval)
	t.wg.Add(1)
	go func() { // S/R-SAFE: stopped during save.
+		defer t.wg.Done()
		for {
			// Start with an update immediately, so the clocks are
			// ready ASAP.
@@ -209,9 +210,6 @@ func (t *Timekeeper) startUpdater() {
				p.realtimeBaseRef = int64(realtimeParams.BaseRef)
				p.realtimeFrequency = realtimeParams.Frequency
			}
-
-			log.Debugf("Updating VDSO parameters: %+v", p)
-
			return p
		}); err != nil {
			log.Warningf("Unable to update VDSO parameter page: %v", err)
@@ -220,7 +218,6 @@ func (t *Timekeeper) startUpdater() {
		select {
		case <-timer.C:
		case <-t.stop:
-			t.wg.Done()
			return
		}
	}
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index f1b3c212c..9bc452e67 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -17,10 +17,9 @@ package kernel
import (
	"fmt"
-	"gvisor.dev/gvisor/pkg/binary"
	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/usermem"
)
@@ -28,6 +27,8 @@ import (
// They are exposed to the VDSO via a parameter page managed by VDSOParamPage,
// which also includes a sequence counter.
+//
+// +marshal
type vdsoParams struct {
	monotonicReady      uint64
	monotonicBaseCycles int64
@@ -58,7 +59,7 @@ type vdsoParams struct {
type VDSOParamPage struct {
	// The parameter page is fr, allocated from mfp.MemoryFile().
	mfp pgalloc.MemoryFileProvider
-	fr  platform.FileRange
+	fr  memmap.FileRange
	// seq is the current sequence count written to the page.
	//
@@ -68,21 +69,29 @@
	// checked in state_test_util tests, causing this field to change across
	// save / restore.
	seq uint64
+
+	// copyScratchBuffer is a temporary buffer used to marshal the params before
+	// copying it to the real parameter page. The parameter page is typically
+	// updated at a moderate frequency of ~O(seconds) throughout the lifetime of
+	// the sentry, so reusing this buffer is a good tradeoff between memory
+	// usage and the cost of allocation.
+	copyScratchBuffer []byte
}
// NewVDSOParamPage returns a VDSOParamPage.
//
// Preconditions:
-//
// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does
// not take ownership of fr; it must remain allocated for the lifetime of the
// VDSOParamPage.
-//
// * VDSOParamPage must be the only writer to fr.
-//
// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
-func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage {
-	return &VDSOParamPage{mfp: mfp, fr: fr}
+func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
+	return &VDSOParamPage{
+		mfp:               mfp,
+		fr:                fr,
+		copyScratchBuffer: make([]byte, (*vdsoParams)(nil).SizeBytes()),
+	}
}
// access returns a mapping of the param page.
@@ -136,7 +145,8 @@ func (v *VDSOParamPage) Write(f func() vdsoParams) error {
	// Get the new params.
	p := f()
-	buf := binary.Marshal(nil, usermem.ByteOrder, p)
+	buf := v.copyScratchBuffer[:p.SizeBytes()]
+	p.MarshalUnsafe(buf)
	// Skip the sequence counter.
	if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil {
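[Editor's note] The Write path above ends by copying copyScratchBuffer into the mapped parameter page, skipping the 8-byte sequence counter. The broader pattern — marshalling a fixed-size struct into a preallocated buffer rather than through reflection-based binary.Marshal — can be sketched standalone. Everything below is illustrative stand-in code, not the sentry's generated marshalling:

package main

import (
	"encoding/binary"
	"fmt"
)

// params stands in for a small fixed-size parameter struct like vdsoParams.
type params struct {
	seq  uint64
	base int64
}

// sizeBytes plays the role of the generated SizeBytes: the wire size is
// fixed, so it can be computed on a nil receiver.
func (p *params) sizeBytes() int { return 16 }

// marshalTo plays the role of the generated MarshalUnsafe: it serializes
// into a caller-provided buffer with no allocation.
func (p *params) marshalTo(b []byte) {
	binary.LittleEndian.PutUint64(b[0:8], p.seq)
	binary.LittleEndian.PutUint64(b[8:16], uint64(p.base))
}

func main() {
	// Allocate the scratch buffer once, as NewVDSOParamPage does above, so
	// that periodic updates do not allocate.
	scratch := make([]byte, (*params)(nil).sizeBytes())

	p := params{seq: 2, base: 12345}
	p.marshalTo(scratch[:p.sizeBytes()])
	fmt.Printf("% x\n", scratch)
}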
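[Editor's note] Stepping back to the TcpipTimer added in time/tcpip.go above: Notify deliberately spawns a goroutine so the callback can Stop (and thus Destroy) the timer without self-deadlocking on Timer.mu. Expected usage is roughly the following sketch; armOneShot is a hypothetical caller, and the ktime alias for pkg/sentry/kernel/time is an assumption:

import (
	"time"

	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
)

// armOneShot is a hypothetical helper showing the one-shot pattern.
func armOneShot(clk ktime.Clock) *ktime.TcpipTimer {
	return ktime.TcpipAfterFunc(clk, 500*time.Millisecond, func() {
		// Runs exactly once, in its own goroutine, 500ms from now on clk.
	})
}

Stop reports whether the timer was still pending (and releases its resources); Reset re-arms the same TcpipTimer for another one-shot expiration.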