diff options
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/inode_refs.go | 5 | ||||
-rw-r--r-- | pkg/sentry/kernel/abstract_socket_namespace.go | 77 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel_state_autogen.go | 6 | ||||
-rw-r--r-- | pkg/sentry/platform/ring0/defs_impl_arm64.go | 2 | ||||
-rw-r--r-- | pkg/sentry/socket/unix/socket_refs.go | 111 | ||||
-rw-r--r-- | pkg/sentry/socket/unix/unix.go | 22 | ||||
-rw-r--r-- | pkg/sentry/socket/unix/unix_state_autogen.go | 36 | ||||
-rw-r--r-- | pkg/sentry/socket/unix/unix_vfs2.go | 6 |
8 files changed, 222 insertions, 43 deletions
diff --git a/pkg/sentry/fsimpl/tmpfs/inode_refs.go b/pkg/sentry/fsimpl/tmpfs/inode_refs.go index 8b7ff185f..38eddde7e 100644 --- a/pkg/sentry/fsimpl/tmpfs/inode_refs.go +++ b/pkg/sentry/fsimpl/tmpfs/inode_refs.go @@ -1,11 +1,10 @@ package tmpfs import ( - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 52ed5cea2..1b9721534 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -15,29 +15,21 @@ package kernel import ( + "fmt" "syscall" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/refs_vfs2" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" ) // +stateify savable type abstractEndpoint struct { - ep transport.BoundEndpoint - wr *refs.WeakRef - name string - ns *AbstractSocketNamespace -} - -// WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (e *abstractEndpoint) WeakRefGone(context.Context) { - e.ns.mu.Lock() - if e.ns.endpoints[e.name].ep == e.ep { - delete(e.ns.endpoints, e.name) - } - e.ns.mu.Unlock() + ep transport.BoundEndpoint + socket refs_vfs2.RefCounter + name string + ns *AbstractSocketNamespace } // AbstractSocketNamespace is used to implement the Linux abstract socket functionality. @@ -46,7 +38,11 @@ func (e *abstractEndpoint) WeakRefGone(context.Context) { type AbstractSocketNamespace struct { mu sync.Mutex `state:"nosave"` - // Keeps mapping from name to endpoint. + // Keeps a mapping from name to endpoint. AbstractSocketNamespace does not hold + // any references on any sockets that it contains; when retrieving a socket, + // TryIncRef() must be called in case the socket is concurrently being + // destroyed. It is the responsibility of the socket to remove itself from the + // abstract socket namespace when it is destroyed. endpoints map[string]abstractEndpoint } @@ -58,15 +54,15 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace { } // A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on -// its backing object. +// its backing socket. type boundEndpoint struct { transport.BoundEndpoint - rc refs.RefCounter + socket refs_vfs2.RefCounter } // Release implements transport.BoundEndpoint.Release. func (e *boundEndpoint) Release(ctx context.Context) { - e.rc.DecRef(ctx) + e.socket.DecRef(ctx) e.BoundEndpoint.Release(ctx) } @@ -81,32 +77,59 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp return nil } - rc := ep.wr.Get() - if rc == nil { - delete(a.endpoints, name) + if !ep.socket.TryIncRef() { + // The socket has reached zero references and is being destroyed. return nil } - return &boundEndpoint{ep.ep, rc} + return &boundEndpoint{ep.ep, ep.socket} } // Bind binds the given socket. // -// When the last reference managed by rc is dropped, ep may be removed from the +// When the last reference managed by socket is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refs_vfs2.RefCounter) error { a.mu.Lock() defer a.mu.Unlock() + // Check if there is already a socket (which has not yet been destroyed) bound at name. if ep, ok := a.endpoints[name]; ok { - if rc := ep.wr.Get(); rc != nil { - rc.DecRef(ctx) + if ep.socket.TryIncRef() { + ep.socket.DecRef(ctx) return syscall.EADDRINUSE } } ae := abstractEndpoint{ep: ep, name: name, ns: a} - ae.wr = refs.NewWeakRef(rc, &ae) + ae.socket = socket a.endpoints[name] = ae return nil } + +// Remove removes the specified socket at name from the abstract socket +// namespace, if it has not yet been replaced. +func (a *AbstractSocketNamespace) Remove(name string, socket refs_vfs2.RefCounter) { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + // We never delete a map entry apart from a socket's destructor (although the + // map entry may be overwritten). Therefore, a socket should exist, even if it + // may not be the one we expect. + panic(fmt.Sprintf("expected socket to exist at '%s' in abstract socket namespace", name)) + } + + // A Bind() operation may race with callers of Remove(), e.g. in the + // following case: + // socket1 reaches zero references and begins destruction + // a.Bind("foo", ep, socket2) replaces socket1 with socket2 + // socket1's destructor calls a.Remove("foo", socket1) + // + // Therefore, we need to check that the socket at name is what we expect + // before modifying the map. + if ep.socket == socket { + delete(a.endpoints, name) + } +} diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go index 8ab5e6f8e..106e237ec 100644 --- a/pkg/sentry/kernel/kernel_state_autogen.go +++ b/pkg/sentry/kernel/kernel_state_autogen.go @@ -16,7 +16,7 @@ func (x *abstractEndpoint) StateTypeName() string { func (x *abstractEndpoint) StateFields() []string { return []string{ "ep", - "wr", + "socket", "name", "ns", } @@ -27,7 +27,7 @@ func (x *abstractEndpoint) beforeSave() {} func (x *abstractEndpoint) StateSave(m state.Sink) { x.beforeSave() m.Save(0, &x.ep) - m.Save(1, &x.wr) + m.Save(1, &x.socket) m.Save(2, &x.name) m.Save(3, &x.ns) } @@ -36,7 +36,7 @@ func (x *abstractEndpoint) afterLoad() {} func (x *abstractEndpoint) StateLoad(m state.Source) { m.Load(0, &x.ep) - m.Load(1, &x.wr) + m.Load(1, &x.socket) m.Load(2, &x.name) m.Load(3, &x.ns) } diff --git a/pkg/sentry/platform/ring0/defs_impl_arm64.go b/pkg/sentry/platform/ring0/defs_impl_arm64.go index 424b66f76..2dac9ad14 100644 --- a/pkg/sentry/platform/ring0/defs_impl_arm64.go +++ b/pkg/sentry/platform/ring0/defs_impl_arm64.go @@ -3,11 +3,11 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "reflect" "fmt" "gvisor.dev/gvisor/pkg/usermem" "io" - "reflect" ) // Useful bits. diff --git a/pkg/sentry/socket/unix/socket_refs.go b/pkg/sentry/socket/unix/socket_refs.go new file mode 100644 index 000000000..8eb0c9327 --- /dev/null +++ b/pkg/sentry/socket/unix/socket_refs.go @@ -0,0 +1,111 @@ +package unix + +import ( + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var socketOpsCommonownerType *socketOpsCommon + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// +stateify savable +type socketOpsCommonRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *socketOpsCommonRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, socketOpsCommonownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *socketOpsCommonRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*socketOpsCommonRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *socketOpsCommonRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *socketOpsCommonRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic("Incrementing non-positive ref count") + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *socketOpsCommonRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *socketOpsCommonRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic("Decrementing non-positive ref count") + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 2b8454edb..b7e8e4325 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -80,7 +79,7 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty stype: stype, }, } - s.EnableLeakCheck("unix.SocketOperations") + s.EnableLeakCheck() return fs.NewFile(ctx, d, flags, &s) } @@ -89,17 +88,26 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty // // +stateify savable type socketOpsCommon struct { - refs.AtomicRefCount + socketOpsCommonRefs socket.SendReceiveTimeout ep transport.Endpoint stype linux.SockType + + // abstractName and abstractNamespace indicate the name and namespace of the + // socket if it is bound to an abstract socket namespace. Once the socket is + // bound, they cannot be modified. + abstractName string + abstractNamespace *kernel.AbstractSocketNamespace } // DecRef implements RefCounter.DecRef. func (s *socketOpsCommon) DecRef(ctx context.Context) { - s.DecRefWithDestructor(ctx, func(context.Context) { + s.socketOpsCommonRefs.DecRef(func() { s.ep.Close(ctx) + if s.abstractNamespace != nil { + s.abstractNamespace.Remove(s.abstractName, s) + } }) } @@ -284,10 +292,14 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { + asn := t.AbstractSockets() + name := p[1:] + if err := asn.Bind(t, name, bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } + s.abstractName = name + s.abstractNamespace = asn } else { // The parent and name. var d *fs.Dirent diff --git a/pkg/sentry/socket/unix/unix_state_autogen.go b/pkg/sentry/socket/unix/unix_state_autogen.go index 4a3bbc11b..6966529c6 100644 --- a/pkg/sentry/socket/unix/unix_state_autogen.go +++ b/pkg/sentry/socket/unix/unix_state_autogen.go @@ -6,6 +6,29 @@ import ( "gvisor.dev/gvisor/pkg/state" ) +func (x *socketOpsCommonRefs) StateTypeName() string { + return "pkg/sentry/socket/unix.socketOpsCommonRefs" +} + +func (x *socketOpsCommonRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *socketOpsCommonRefs) beforeSave() {} + +func (x *socketOpsCommonRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *socketOpsCommonRefs) afterLoad() {} + +func (x *socketOpsCommonRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *SocketOperations) StateTypeName() string { return "pkg/sentry/socket/unix.SocketOperations" } @@ -35,10 +58,12 @@ func (x *socketOpsCommon) StateTypeName() string { func (x *socketOpsCommon) StateFields() []string { return []string{ - "AtomicRefCount", + "socketOpsCommonRefs", "SendReceiveTimeout", "ep", "stype", + "abstractName", + "abstractNamespace", } } @@ -46,22 +71,27 @@ func (x *socketOpsCommon) beforeSave() {} func (x *socketOpsCommon) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.socketOpsCommonRefs) m.Save(1, &x.SendReceiveTimeout) m.Save(2, &x.ep) m.Save(3, &x.stype) + m.Save(4, &x.abstractName) + m.Save(5, &x.abstractNamespace) } func (x *socketOpsCommon) afterLoad() {} func (x *socketOpsCommon) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.socketOpsCommonRefs) m.Load(1, &x.SendReceiveTimeout) m.Load(2, &x.ep) m.Load(3, &x.stype) + m.Load(4, &x.abstractName) + m.Load(5, &x.abstractNamespace) } func init() { + state.Register((*socketOpsCommonRefs)(nil)) state.Register((*SocketOperations)(nil)) state.Register((*socketOpsCommon)(nil)) } diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index dfa25241a..d066ef8ab 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -183,10 +183,14 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { + asn := t.AbstractSockets() + name := p[1:] + if err := asn.Bind(t, name, bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } + s.abstractName = name + s.abstractNamespace = asn } else { path := fspath.Parse(p) root := t.FSContext().RootDirectoryVFS2() |