From 8b5e9dbae85d0877a60112055aa304665d5e39fa Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Tue, 18 Aug 2020 10:20:17 -0700 Subject: [vfs2] Implement /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem. Updates #1035 PiperOrigin-RevId: 327253907 --- pkg/sentry/fsimpl/proc/BUILD | 1 + pkg/sentry/fsimpl/proc/tasks_sys.go | 110 ++++++++++++++++++++++++++++++++++-- 2 files changed, 106 insertions(+), 5 deletions(-) (limited to 'pkg/sentry/fsimpl/proc') diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 6014138ff..14ecfd300 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index b71778128..6435385ef 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -25,10 +25,18 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) +type tcpMemDir int + +const ( + tcpRMem tcpMemDir = iota + tcpWMem +) + // newSysDir returns the dentry corresponding to /proc/sys directory. func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ @@ -56,7 +64,9 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke contents = map[string]*kernfs.Dentry{ "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ "tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}), + "tcp_rmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}), "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), + "tcp_wmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the @@ -181,10 +191,11 @@ func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Tough luck. val = "1\n" } - buf.WriteString(val) - return nil + _, err := buf.WriteString(val) + return err } +// Write implements vfs.WritableDynamicBytesSource.Write. func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. @@ -200,7 +211,7 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset var v int32 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) if err != nil { - return n, err + return 0, err } if d.enabled == nil { d.enabled = new(bool) @@ -228,10 +239,11 @@ func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error return err } - buf.WriteString(fmt.Sprintf("%d\n", recovery)) - return nil + _, err = buf.WriteString(fmt.Sprintf("%d\n", recovery)) + return err } +// Write implements vfs.WritableDynamicBytesSource.Write. func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. @@ -254,3 +266,91 @@ func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, off } return n, nil } + +// tcpMemData implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem. +// +// +stateify savable +type tcpMemData struct { + kernfs.DynamicBytesFile + + dir tcpMemDir + stack inet.Stack `state:"wait"` + + // mu protects against concurrent reads/writes to FDs based on the dentry + // backing this byte source. + mu sync.Mutex `state:"nosave"` +} + +var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil) + +// Generate implements vfs.DynamicBytesSource. +func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { + d.mu.Lock() + defer d.mu.Unlock() + + size, err := d.readSizeLocked() + if err != nil { + return err + } + _, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max)) + return err +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + d.mu.Lock() + defer d.mu.Unlock() + + // Limit the amount of memory allocated. + src = src.TakeFirst(usermem.PageSize - 1) + size, err := d.readSizeLocked() + if err != nil { + return 0, err + } + buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)} + n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) + if err != nil { + return 0, err + } + newSize := inet.TCPBufferSize{ + Min: int(buf[0]), + Default: int(buf[1]), + Max: int(buf[2]), + } + if err := d.writeSizeLocked(newSize); err != nil { + return 0, err + } + return n, nil +} + +// Precondition: d.mu must be locked. +func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) { + switch d.dir { + case tcpRMem: + return d.stack.TCPReceiveBufferSize() + case tcpWMem: + return d.stack.TCPSendBufferSize() + default: + panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) + } +} + +// Precondition: d.mu must be locked. +func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error { + switch d.dir { + case tcpRMem: + return d.stack.SetTCPReceiveBufferSize(size) + case tcpWMem: + return d.stack.SetTCPSendBufferSize(size) + default: + panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) + } +} -- cgit v1.2.3 From 1666c8919d9d4ced966977f23e2905ff835eaaa0 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 21 Aug 2020 14:28:27 -0700 Subject: Make mounts ReadWrite first, then later change to ReadOnly. This lets us create "synthetic" mountpoint directories in ReadOnly mounts during VFS setup. Also add context.WithMountNamespace, as some filesystems (like overlay) require a MountNamespace on ctx to handle vfs.Filesystem Operations. PiperOrigin-RevId: 327874971 --- pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 2 +- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 2 +- pkg/sentry/fsimpl/proc/tasks_test.go | 2 +- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 2 +- pkg/sentry/syscalls/linux/vfs2/mount.go | 4 +- pkg/sentry/vfs/context.go | 24 ++++++++++ pkg/sentry/vfs/mount.go | 19 ++++++-- runsc/boot/vfs.go | 55 ++++++++++++++++------- 8 files changed, 83 insertions(+), 27 deletions(-) (limited to 'pkg/sentry/fsimpl/proc') diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 6b56c5e71..827a608cb 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -63,7 +63,7 @@ func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.Virtu }); err != nil { t.Fatalf("failed to create mount point: %v", err) } - if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil { + if _, err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil { t.Fatalf("failed to mount devtmpfs: %v", err) } diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 8f7d5a9bb..a2cc9b59f 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -90,7 +90,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf ctx := contexttest.Context(b) creds := auth.CredentialsFromContext(ctx) - if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ + if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ InternalData: int(f.Fd()), }, diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 3c9297dee..d82b3d2f3 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -132,7 +132,7 @@ func setup(t *testing.T) *testutil.System { }, }, } - if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { + if _, err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { t.Fatalf("MountAt(/proc): %v", err) } return testutil.NewSystem(ctx, t, k.VFS(), mntns) diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index d263147c2..e5a4218e8 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -405,7 +405,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { } defer mountPoint.DecRef(ctx) // Create and mount the submount. - if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { + if _, err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } filePathBuilder.WriteString(mountPointName) diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go index 4bd5c7ca2..769c9b92f 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mount.go +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -109,8 +109,8 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } defer target.Release(t) - - return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) + _, err = t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) + return 0, nil, err } // Umount2 implements Linux syscall umount2(2). diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index c9e724fef..97018651f 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -40,6 +40,30 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace { return nil } +type mountNamespaceContext struct { + context.Context + mntns *MountNamespace +} + +// Value implements Context.Value. +func (mc mountNamespaceContext) Value(key interface{}) interface{} { + switch key { + case CtxMountNamespace: + mc.mntns.IncRef() + return mc.mntns + default: + return mc.Context.Value(key) + } +} + +// WithMountNamespace returns a copy of ctx with the given MountNamespace. +func WithMountNamespace(ctx context.Context, mntns *MountNamespace) context.Context { + return &mountNamespaceContext{ + Context: ctx, + mntns: mntns, + } +} + // RootFromContext returns the VFS root used by ctx. It takes a reference on // the returned VirtualDentry. If ctx does not have a specific VFS root, // RootFromContext returns a zero-value VirtualDentry. diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 714af6907..09fea3628 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -263,16 +263,20 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr } // MountAt creates and mounts a Filesystem configured by the given arguments. -func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { +// The VirtualFilesystem will hold a reference to the Mount until it is unmounted. +// +// This method returns the mounted Mount without a reference, for convenience +// during VFS setup when there is no chance of racing with unmount. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) { mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) if err != nil { - return err + return nil, err } defer mnt.DecRef(ctx) if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { - return err + return nil, err } - return nil + return mnt, nil } // UmountAt removes the Mount at the given path. @@ -657,6 +661,13 @@ retryFirst: return VirtualDentry{mnt, d} } +// SetMountReadOnly sets the mount as ReadOnly. +func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + return mnt.setReadOnlyLocked(ro) +} + // CheckBeginWrite increments the counter of in-progress write operations on // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns // EROFS. diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index f27a6ff6b..fb200e988 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -205,15 +205,34 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config. for i := range mounts { submount := &mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + var ( + mnt *vfs.Mount + err error + ) + if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { - if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil { + mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint) + if err != nil { return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) } } else { - if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { + mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount) + if err != nil { return fmt.Errorf("mount submount %q: %w", submount.Destination, err) } } + + if mnt != nil && mnt.ReadOnly() { + // Switch to ReadWrite while we setup submounts. + if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { + return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err) + } + defer func() { + if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { + panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err)) + } + }() + } } if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil { @@ -256,7 +275,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { return mounts, nil } -func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) { root := mns.Root() defer root.DecRef(ctx) target := &vfs.PathOperation{ @@ -266,21 +285,22 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C } fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount) if err != nil { - return fmt.Errorf("mountOptions failed: %w", err) + return nil, fmt.Errorf("mountOptions failed: %w", err) } if len(fsName) == 0 { // Filesystem is not supported (e.g. cgroup), just skip it. - return nil + return nil, nil } if err := c.k.VFS().MkdirAllAt(ctx, submount.Destination, root, creds, &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}); err != nil { - return err + return nil, err } - if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil { - return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) + if err != nil { + return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data) - return nil + return mnt, nil } // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values @@ -407,7 +427,8 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } - return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + return err case syserror.ENOTDIR: // Not a dir?! Let it be. @@ -458,25 +479,25 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf // mountSharedSubmount binds mount to a previously mounted volume that is shared // among containers in the same pod. -func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error { +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) { if err := source.checkCompatible(mount); err != nil { - return err + return nil, err } _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) if err != nil { - return err + return nil, err } newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts) if err != nil { - return err + return nil, err } defer newMnt.DecRef(ctx) root := mns.Root() defer root.DecRef(ctx) if err := c.k.VFS().MkdirAllAt(ctx, mount.Destination, root, creds, &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}); err != nil { - return err + return nil, err } target := &vfs.PathOperation{ @@ -485,8 +506,8 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co Path: fspath.Parse(mount.Destination), } if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { - return err + return nil, err } log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) - return nil + return newMnt, nil } -- cgit v1.2.3 From 3810a62b3a2e6bb55c3d030e15ba09665f2f91b3 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Fri, 21 Aug 2020 16:03:38 -0700 Subject: Clarify seek behaviour for kernfs.GenericDirectoryFD. - Remove comment about GenericDirectoryFD not being compatible with dynamic directories. It is currently being used to implement dynamic directories. - Try to handle SEEK_END better than setting the offset to infinity. SEEK_END is poorly defined for dynamic directories anyways, so at least try make it work correctly for the static entries. Updates #1193. PiperOrigin-RevId: 327890128 --- pkg/sentry/fsimpl/devpts/devpts.go | 4 ++- pkg/sentry/fsimpl/fuse/fusefs.go | 4 ++- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 46 +++++++++++++++++++++++------ pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 12 ++++---- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 8 +++-- pkg/sentry/fsimpl/proc/filesystem.go | 6 ++++ pkg/sentry/fsimpl/proc/subtasks.go | 4 ++- pkg/sentry/fsimpl/proc/task.go | 8 +++-- pkg/sentry/fsimpl/proc/task_fds.go | 8 +++-- pkg/sentry/fsimpl/proc/tasks.go | 4 ++- pkg/sentry/fsimpl/proc/tasks_sys.go | 12 ++++---- pkg/sentry/fsimpl/sys/sys.go | 4 ++- 12 files changed, 89 insertions(+), 31 deletions(-) (limited to 'pkg/sentry/fsimpl/proc') diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 7169e91af..3f3a099bd 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -185,7 +185,9 @@ func (i *rootInode) masterClose(t *Terminal) { // Open implements kernfs.Inode.Open. func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 83c24ec25..44021ee4b 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -220,7 +220,9 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index fcee6200a..6518ff5cd 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -15,7 +15,7 @@ package kernfs import ( - "math" + "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -28,9 +28,25 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// SeekEndConfig describes the SEEK_END behaviour for FDs. +type SeekEndConfig int + +// Constants related to SEEK_END behaviour for FDs. +const ( + // Consider the end of the file to be after the final static entry. This is + // the default option. + SeekEndStaticEntries = iota + // Consider the end of the file to be at offset 0. + SeekEndZero +) + +// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD. +type GenericDirectoryFDOptions struct { + SeekEnd SeekEndConfig +} + // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory -// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not -// compatible with dynamic directories. +// inode that uses OrderChildren to track child nodes. // // Note that GenericDirectoryFD holds a lock over OrderedChildren while calling // IterDirents callback. The IterDirents callback therefore cannot hash or @@ -45,6 +61,9 @@ type GenericDirectoryFD struct { vfs.DirectoryFileDescriptionDefaultImpl vfs.LockFD + // Immutable. + seekEnd SeekEndConfig + vfsfd vfs.FileDescription children *OrderedChildren @@ -57,9 +76,9 @@ type GenericDirectoryFD struct { // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. -func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { +func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} - if err := fd.Init(children, locks, opts); err != nil { + if err := fd.Init(children, locks, opts, fdOpts); err != nil { return nil, err } if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { @@ -71,12 +90,13 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre // Init initializes a GenericDirectoryFD. Use it when overriding // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the // correct implementation. -func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error { +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } fd.LockFD.Init(locks) + fd.seekEnd = fdOpts.SeekEnd fd.children = children return nil } @@ -209,9 +229,17 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: - // TODO(gvisor.dev/issue/1193): This can prevent new files from showing up - // if they are added after SEEK_END. - offset = math.MaxInt64 + switch fd.seekEnd { + case SeekEndStaticEntries: + fd.children.mu.RLock() + offset += int64(len(fd.children.set)) + offset += 2 // '.' and '..' aren't tracked in children. + fd.children.mu.RUnlock() + case SeekEndZero: + // No-op: offset += 0. + default: + panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd)) + } default: return 0, syserror.EINVAL } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index fe8a1e710..885856868 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -555,15 +555,16 @@ type StaticDirectory struct { InodeNoDynamicLookup OrderedChildren - locks vfs.FileLocks + locks vfs.FileLocks + fdOpts GenericDirectoryFDOptions } var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. -func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { +func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry, fdOpts GenericDirectoryFDOptions) *Dentry { inode := &StaticDirectory{} - inode.Init(creds, devMajor, devMinor, ino, perm) + inode.Init(creds, devMajor, devMinor, ino, perm, fdOpts) dentry := &Dentry{} dentry.Init(inode) @@ -576,16 +577,17 @@ func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64 } // Init initializes StaticDirectory. -func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { +func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } + s.fdOpts = fdOpts s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } // Open implements kernfs.Inode. func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts) + fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index c5d5afedf..e5c28c0e4 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -119,7 +119,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod } func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } @@ -151,7 +153,9 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte } func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 2463d51cd..c350ec127 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -110,6 +110,12 @@ func newStaticFile(data string) *staticFile { return &staticFile{StaticData: vfs.StaticData{Data: data}} } +func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { + return kernfs.NewStaticDir(creds, devMajor, devMinor, ino, perm, children, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) +} + // InternalData contains internal data passed in to the procfs mount via // vfs.GetFilesystemOptions.InternalData. type InternalData struct { diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 79c2725f3..f25747da3 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -155,7 +155,9 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro // Open implements kernfs.Inode. func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} - if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil { + if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }); err != nil { return nil, err } if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index a5c7aa470..109b31b4c 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -105,7 +105,9 @@ func (i *taskInode) Valid(ctx context.Context) bool { // Open implements kernfs.Inode. func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } @@ -142,7 +144,9 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux. dir := &kernfs.StaticDirectory{} // Note: credentials are overridden by taskOwnedInode. - dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) + dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) inode := &taskOwnedInode{Inode: dir, owner: task} d := &kernfs.Dentry{} diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index f0d3f7f5e..e8fcb9aa1 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -144,7 +144,9 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro // Open implements kernfs.Inode. func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } @@ -271,7 +273,9 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, // Open implements kernfs.Inode. func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 6d2b90a8b..1391992b7 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -199,7 +199,9 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback // Open implements kernfs.Inode. func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 6435385ef..038a194c7 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -39,14 +39,14 @@ const ( // newSysDir returns the dentry corresponding to /proc/sys directory. func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { - return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "kernel": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ "hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}), "shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)), "shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)), "shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)), }), - "vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "vm": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ "mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}), "overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")), }), @@ -62,7 +62,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke // network namespace of the calling process. if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]*kernfs.Dentry{ - "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "ipv4": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ "tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}), "tcp_rmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}), "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), @@ -109,7 +109,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke "tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), "tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), }), - "core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "core": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ "default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")), "message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")), "message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), @@ -123,7 +123,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke } } - return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents) + return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 0401726b6..393feb802 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -131,7 +131,9 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set // Open implements kernfs.Inode.Open. func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } -- cgit v1.2.3 From 87e03869065f0784bf9ed76855205693128f65a4 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 25 Aug 2020 21:01:45 -0700 Subject: Use new reference count utility throughout gvisor. This uses the refs_vfs2 template in vfs2 as well as objects common to vfs1 and vfs2. Note that vfs1-only refcounts are not replaced, since vfs1 will be deleted soon anyway. The following structs now use the new tool, with leak check enabled: devpts:rootInode fuse:inode kernfs:Dentry kernfs:dir kernfs:readonlyDir kernfs:StaticDirectory proc:fdDirInode proc:fdInfoDirInode proc:subtasksInode proc:taskInode proc:tasksInode vfs:FileDescription vfs:MountNamespace vfs:Filesystem sys:dir kernel:FSContext kernel:ProcessGroup kernel:Session shm:Shm mm:aioMappable mm:SpecialMappable transport:queue And the following use the template, but because they currently are not leak checked, a TODO is left instead of enabling leak check in this patch: kernel:FDTable tun:tunEndpoint Updates #1486. PiperOrigin-RevId: 328460377 --- pkg/refs_vfs2/BUILD | 2 +- pkg/refs_vfs2/refs_template.go | 17 ++++- pkg/sentry/fsimpl/devpts/BUILD | 15 ++++ pkg/sentry/fsimpl/devpts/devpts.go | 7 ++ pkg/sentry/fsimpl/fuse/BUILD | 13 ++++ pkg/sentry/fsimpl/fuse/fusefs.go | 7 ++ pkg/sentry/fsimpl/kernfs/BUILD | 54 ++++++++++++- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 27 ++++--- pkg/sentry/fsimpl/kernfs/kernfs.go | 24 +++--- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 12 +++ pkg/sentry/fsimpl/proc/BUILD | 61 +++++++++++++++ pkg/sentry/fsimpl/proc/subtasks.go | 7 ++ pkg/sentry/fsimpl/proc/task.go | 8 ++ pkg/sentry/fsimpl/proc/task_fds.go | 16 +++- pkg/sentry/fsimpl/proc/task_net.go | 6 +- pkg/sentry/fsimpl/proc/tasks.go | 7 ++ pkg/sentry/fsimpl/sys/BUILD | 15 ++++ pkg/sentry/fsimpl/sys/sys.go | 9 ++- pkg/sentry/kernel/BUILD | 48 ++++++++++++ pkg/sentry/kernel/fd_table.go | 21 +++-- pkg/sentry/kernel/fd_table_unsafe.go | 2 + pkg/sentry/kernel/fs_context.go | 89 ++++++++++++---------- pkg/sentry/kernel/sessions.go | 29 +++---- pkg/sentry/kernel/shm/BUILD | 13 ++++ pkg/sentry/kernel/shm/shm.go | 19 ++--- pkg/sentry/mm/BUILD | 24 ++++++ pkg/sentry/mm/aio_context.go | 7 +- pkg/sentry/mm/special_mappable.go | 7 +- pkg/sentry/socket/unix/transport/BUILD | 12 +++ pkg/sentry/socket/unix/transport/connectioned.go | 8 +- pkg/sentry/socket/unix/transport/connectionless.go | 2 +- pkg/sentry/socket/unix/transport/queue.go | 13 ++-- pkg/sentry/vfs/BUILD | 37 +++++++++ pkg/sentry/vfs/README.md | 9 --- pkg/sentry/vfs/file_description.go | 39 +--------- pkg/sentry/vfs/filesystem.go | 37 +-------- pkg/sentry/vfs/mount.go | 21 ++--- pkg/tcpip/link/tun/BUILD | 14 ++++ pkg/tcpip/link/tun/device.go | 9 +-- 39 files changed, 531 insertions(+), 236 deletions(-) (limited to 'pkg/sentry/fsimpl/proc') diff --git a/pkg/refs_vfs2/BUILD b/pkg/refs_vfs2/BUILD index 7b3e10683..577b827a5 100644 --- a/pkg/refs_vfs2/BUILD +++ b/pkg/refs_vfs2/BUILD @@ -11,7 +11,7 @@ go_template( types = [ "T", ], - visibility = ["//pkg/sentry:internal"], + visibility = ["//:sandbox"], deps = [ "//pkg/log", "//pkg/refs", diff --git a/pkg/refs_vfs2/refs_template.go b/pkg/refs_vfs2/refs_template.go index 99c43c065..d9b552896 100644 --- a/pkg/refs_vfs2/refs_template.go +++ b/pkg/refs_vfs2/refs_template.go @@ -12,11 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package refs_template defines a template that can be used by reference counted -// objects. +// Package refs_template defines a template that can be used by reference +// counted objects. The "owner" template parameter is used in log messages to +// indicate the type of reference-counted object that exhibited a reference +// leak. As a result, structs that are embedded in other structs should not use +// this template, since it will make tracking down leaks more difficult. package refs_template import ( + "fmt" "runtime" "sync/atomic" @@ -38,6 +42,11 @@ var ownerType *T // Note that the number of references is actually refCount + 1 so that a default // zero-value Refs object contains one reference. // +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// // +stateify savable type Refs struct { // refCount is composed of two fields: @@ -82,7 +91,7 @@ func (r *Refs) ReadRefs() int64 { //go:nosplit func (r *Refs) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, ownerType)) } } @@ -122,7 +131,7 @@ func (r *Refs) TryIncRef() bool { func (r *Refs) DecRef(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: - panic("Decrementing non-positive ref count") + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ownerType)) case v == -1: // Call the destructor. diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index 93512c9b6..3f64fab3a 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -1,7 +1,19 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "root_inode_refs", + out = "root_inode_refs.go", + package = "devpts", + prefix = "rootInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "rootInode", + }, +) + go_library( name = "devpts", srcs = [ @@ -9,6 +21,7 @@ go_library( "line_discipline.go", "master.go", "queue.go", + "root_inode_refs.go", "slave.go", "terminal.go", ], @@ -16,6 +29,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/log", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs/lock", diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 3f3a099bd..0eaff9087 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -83,6 +83,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds } root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + root.EnableLeakCheck() root.dentry.Init(root) // Construct the pts master inode and dentry. Linux always uses inode @@ -110,6 +111,7 @@ func (fs *filesystem) Release(ctx context.Context) { // rootInode is the root directory inode for the devpts mounts. type rootInode struct { + rootInodeRefs kernfs.AlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren @@ -233,3 +235,8 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, } return offset, nil } + +// DecRef implements kernfs.Inode. +func (i *rootInode) DecRef(context.Context) { + i.rootInodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 999111deb..53a4f3012 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "inode_refs", + out = "inode_refs.go", + package = "fuse", + prefix = "inode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "inode", + }, +) + go_library( name = "fuse", srcs = [ @@ -22,6 +33,7 @@ go_library( "dev.go", "fusefs.go", "init.go", + "inode_refs.go", "register.go", "request_list.go", ], @@ -30,6 +42,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/log", + "//pkg/refs", "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 44021ee4b..9717c0e15 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -198,6 +198,7 @@ func (fs *filesystem) Release(ctx context.Context) { // inode implements kernfs.Inode. type inode struct { + inodeRefs kernfs.InodeAttrs kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink @@ -213,6 +214,7 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke i := &inode{} i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + i.EnableLeakCheck() i.dentry.Init(i) return &i.dentry @@ -324,3 +326,8 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil } + +// DecRef implements kernfs.Inode. +func (i *inode) DecRef(context.Context) { + i.inodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 3835557fe..637dca70c 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -26,9 +26,54 @@ go_template_instance( }, ) +go_template_instance( + name = "dentry_refs", + out = "dentry_refs.go", + package = "kernfs", + prefix = "Dentry", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Dentry", + }, +) + +go_template_instance( + name = "static_directory_refs", + out = "static_directory_refs.go", + package = "kernfs", + prefix = "StaticDirectory", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "StaticDirectory", + }, +) + +go_template_instance( + name = "dir_refs", + out = "dir_refs.go", + package = "kernfs_test", + prefix = "dir", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "dir", + }, +) + +go_template_instance( + name = "readonly_dir_refs", + out = "readonly_dir_refs.go", + package = "kernfs_test", + prefix = "readonlyDir", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "readonlyDir", + }, +) + go_library( name = "kernfs", srcs = [ + "dentry_refs.go", "dynamic_bytes_file.go", "fd_impl_util.go", "filesystem.go", @@ -36,6 +81,7 @@ go_library( "inode_impl_util.go", "kernfs.go", "slot_list.go", + "static_directory_refs.go", "symlink.go", ], visibility = ["//pkg/sentry:internal"], @@ -59,11 +105,17 @@ go_library( go_test( name = "kernfs_test", size = "small", - srcs = ["kernfs_test.go"], + srcs = [ + "dir_refs.go", + "kernfs_test.go", + "readonly_dir_refs.go", + ], deps = [ ":kernfs", "//pkg/abi/linux", "//pkg/context", + "//pkg/log", + "//pkg/refs", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 885856868..f442a5606 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -344,8 +343,6 @@ type OrderedChildrenOptions struct { // // Must be initialize with Init before first use. type OrderedChildren struct { - refs.AtomicRefCount - // Can children be modified by user syscalls? It set to false, interface // methods that would modify the children return EPERM. Immutable. writable bool @@ -361,14 +358,14 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { o.set = make(map[string]*slot) } -// DecRef implements Inode.DecRef. -func (o *OrderedChildren) DecRef(ctx context.Context) { - o.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { - o.mu.Lock() - defer o.mu.Unlock() - o.order.Reset() - o.set = nil - }) +// Destroy clears the children stored in o. It should be called by structs +// embedding OrderedChildren upon destruction, i.e. when their reference count +// reaches zero. +func (o *OrderedChildren) Destroy() { + o.mu.Lock() + defer o.mu.Unlock() + o.order.Reset() + o.set = nil } // Populate inserts children into this OrderedChildren, and d's dentry @@ -549,6 +546,7 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D // // +stateify savable type StaticDirectory struct { + StaticDirectoryRefs InodeNotSymlink InodeDirectoryNoNewChildren InodeAttrs @@ -594,11 +592,16 @@ func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd return fd.VFSFileDescription(), nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } +// DecRef implements kernfs.Inode. +func (s *StaticDirectory) DecRef(context.Context) { + s.StaticDirectoryRefs.DecRef(s.Destroy) +} + // AlwaysValid partially implements kernfs.inodeDynamicLookup. type AlwaysValid struct{} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 51dbc050c..ca3685800 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -57,7 +57,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -161,9 +160,9 @@ const ( // // Must be initialized by Init prior to first use. type Dentry struct { - vfsd vfs.Dentry + DentryRefs - refs.AtomicRefCount + vfsd vfs.Dentry // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. @@ -194,6 +193,7 @@ func (d *Dentry) Init(inode Inode) { if ftype == linux.ModeSymlink { d.flags |= dflagsIsSymlink } + d.EnableLeakCheck() } // VFSDentry returns the generic vfs dentry for this kernfs dentry. @@ -213,16 +213,14 @@ func (d *Dentry) isSymlink() bool { // DecRef implements vfs.DentryImpl.DecRef. func (d *Dentry) DecRef(ctx context.Context) { - d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy) -} - -// Precondition: Dentry must be removed from VFS' dentry cache. -func (d *Dentry) destroy(ctx context.Context) { - d.inode.DecRef(ctx) // IncRef from Init. - d.inode = nil - if d.parent != nil { - d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. - } + // Before the destructor is called, Dentry must be removed from VFS' dentry cache. + d.DentryRefs.DecRef(func() { + d.inode.DecRef(ctx) // IncRef from Init. + d.inode = nil + if d.parent != nil { + d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. + } + }) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index e5c28c0e4..e376d1736 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -96,6 +96,7 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S } type readonlyDir struct { + readonlyDirRefs attrs kernfs.InodeNotSymlink kernfs.InodeNoDynamicLookup @@ -111,6 +112,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod dir := &readonlyDir{} dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + dir.EnableLeakCheck() dir.dentry.Init(dir) dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) @@ -128,7 +130,12 @@ func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs return fd.VFSFileDescription(), nil } +func (d *readonlyDir) DecRef(context.Context) { + d.readonlyDirRefs.DecRef(d.Destroy) +} + type dir struct { + dirRefs attrs kernfs.InodeNotSymlink kernfs.InodeNoDynamicLookup @@ -145,6 +152,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte dir.fs = fs dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) + dir.EnableLeakCheck() dir.dentry.Init(dir) dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) @@ -162,6 +170,10 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, return fd.VFSFileDescription(), nil } +func (d *dir) DecRef(context.Context) { + d.dirRefs.DecRef(d.Destroy) +} + func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { creds := auth.CredentialsFromContext(ctx) dir := d.fs.newDir(creds, opts.Mode, nil) diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 14ecfd300..a45b44440 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -1,18 +1,79 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "fd_dir_inode_refs", + out = "fd_dir_inode_refs.go", + package = "proc", + prefix = "fdDirInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "fdDirInode", + }, +) + +go_template_instance( + name = "fd_info_dir_inode_refs", + out = "fd_info_dir_inode_refs.go", + package = "proc", + prefix = "fdInfoDirInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "fdInfoDirInode", + }, +) + +go_template_instance( + name = "subtasks_inode_refs", + out = "subtasks_inode_refs.go", + package = "proc", + prefix = "subtasksInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "subtasksInode", + }, +) + +go_template_instance( + name = "task_inode_refs", + out = "task_inode_refs.go", + package = "proc", + prefix = "taskInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "taskInode", + }, +) + +go_template_instance( + name = "tasks_inode_refs", + out = "tasks_inode_refs.go", + package = "proc", + prefix = "tasksInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "tasksInode", + }, +) + go_library( name = "proc", srcs = [ + "fd_dir_inode_refs.go", + "fd_info_dir_inode_refs.go", "filesystem.go", "subtasks.go", + "subtasks_inode_refs.go", "task.go", "task_fds.go", "task_files.go", + "task_inode_refs.go", "task_net.go", "tasks.go", "tasks_files.go", + "tasks_inode_refs.go", "tasks_sys.go", ], visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index f25747da3..01c0efb3a 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -31,6 +31,7 @@ import ( // // +stateify savable type subtasksInode struct { + subtasksInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -57,6 +58,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, // Note: credentials are overridden by taskOwnedInode. subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + subInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: subInode, owner: task} dentry := &kernfs.Dentry{} @@ -182,3 +184,8 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } + +// DecRef implements kernfs.Inode. +func (i *subtasksInode) DecRef(context.Context) { + i.subtasksInodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 109b31b4c..66b557abd 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -32,6 +32,7 @@ import ( // // +stateify savable type taskInode struct { + taskInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeNoDynamicLookup @@ -84,6 +85,7 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + taskInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: taskInode, owner: task} dentry := &kernfs.Dentry{} @@ -119,6 +121,11 @@ func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, v return syserror.EPERM } +// DecRef implements kernfs.Inode. +func (i *taskInode) DecRef(context.Context) { + i.taskInodeRefs.DecRef(i.Destroy) +} + // taskOwnedInode implements kernfs.Inode and overrides inode owner with task // effective user and group. type taskOwnedInode struct { @@ -147,6 +154,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux. dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) + dir.EnableLeakCheck() inode := &taskOwnedInode{Inode: dir, owner: task} d := &kernfs.Dentry{} diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index e8fcb9aa1..0527b2de8 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -101,6 +100,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off // // +stateify savable type fdDirInode struct { + fdDirInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -120,6 +120,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { }, } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -175,6 +176,11 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia return err } +// DecRef implements kernfs.Inode. +func (i *fdDirInode) DecRef(context.Context) { + i.fdDirInodeRefs.DecRef(i.Destroy) +} + // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. // // +stateify savable @@ -227,6 +233,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen // // +stateify savable type fdInfoDirInode struct { + fdInfoDirInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -245,6 +252,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { }, } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -282,12 +290,16 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode. +func (i *fdInfoDirInode) DecRef(context.Context) { + i.fdInfoDirInodeRefs.DecRef(i.Destroy) +} + // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. // // +stateify savable type fdInfoData struct { kernfs.DynamicBytesFile - refs.AtomicRefCount task *kernel.Task fd int32 diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index a4c884bf9..4e69782c7 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // For now, we always redact this pointer. fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - s.Refs()-1, // RefCount, don't count our own ref. + s.ReadRefs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. @@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 1391992b7..863c4467e 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -37,6 +37,7 @@ const ( // // +stateify savable type tasksInode struct { + tasksInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -84,6 +85,7 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -226,6 +228,11 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St return stat, nil } +// DecRef implements kernfs.Inode. +func (i *tasksInode) DecRef(context.Context) { + i.tasksInodeRefs.DecRef(i.Destroy) +} + // staticFileSetStat implements a special static file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index f9b232da6..906cd52cb 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -1,10 +1,23 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "dir_refs", + out = "dir_refs.go", + package = "sys", + prefix = "dir", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "dir", + }, +) + go_library( name = "sys", srcs = [ + "dir_refs.go", "kcov.go", "sys.go", ], @@ -13,6 +26,8 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/coverage", + "//pkg/log", + "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 1f042d9f7..ea30a4ec2 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -118,6 +118,7 @@ func (fs *filesystem) Release(ctx context.Context) { // dir implements kernfs.Inode. type dir struct { + dirRefs kernfs.InodeAttrs kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink @@ -133,6 +134,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte d := &dir{} d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + d.EnableLeakCheck() d.dentry.Init(d) d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents)) @@ -140,7 +142,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &d.dentry } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -156,6 +158,11 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode.DecRef. +func (d *dir) DecRef(context.Context) { + d.dirRefs.DecRef(d.Destroy) +} + // cpuFile implements kernfs.Inode. type cpuFile struct { kernfs.DynamicBytesFile diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index d1ecceba3..d436daab4 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -74,6 +74,50 @@ go_template_instance( }, ) +go_template_instance( + name = "fd_table_refs", + out = "fd_table_refs.go", + package = "kernel", + prefix = "FDTable", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "FDTable", + }, +) + +go_template_instance( + name = "fs_context_refs", + out = "fs_context_refs.go", + package = "kernel", + prefix = "FSContext", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "FSContext", + }, +) + +go_template_instance( + name = "process_group_refs", + out = "process_group_refs.go", + package = "kernel", + prefix = "ProcessGroup", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "ProcessGroup", + }, +) + +go_template_instance( + name = "session_refs", + out = "session_refs.go", + package = "kernel", + prefix = "Session", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Session", + }, +) + proto_library( name = "uncaught_signal", srcs = ["uncaught_signal.proto"], @@ -88,8 +132,10 @@ go_library( "aio.go", "context.go", "fd_table.go", + "fd_table_refs.go", "fd_table_unsafe.go", "fs_context.go", + "fs_context_refs.go", "ipc_namespace.go", "kcov.go", "kcov_unsafe.go", @@ -101,6 +147,7 @@ go_library( "pending_signals_state.go", "posixtimer.go", "process_group_list.go", + "process_group_refs.go", "ptrace.go", "ptrace_amd64.go", "ptrace_arm64.go", @@ -108,6 +155,7 @@ go_library( "seccomp.go", "seqatomic_taskgoroutineschedinfo_unsafe.go", "session_list.go", + "session_refs.go", "sessions.go", "signal.go", "signal_handlers.go", diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index ce53af69b..5773244ac 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" @@ -78,7 +77,8 @@ type descriptor struct { // // +stateify savable type FDTable struct { - refs.AtomicRefCount + FDTableRefs + k *Kernel // mu protects below. @@ -176,16 +176,15 @@ func (k *Kernel) NewFDTable() *FDTable { return f } -// destroy removes all of the file descriptors from the map. -func (f *FDTable) destroy(ctx context.Context) { - f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { - return true - }) -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. +// DecRef implements RefCounter.DecRef. +// +// If f reaches zero references, all of its file descriptors are removed. func (f *FDTable) DecRef(ctx context.Context) { - f.DecRefWithDestructor(ctx, f.destroy) + f.FDTableRefs.DecRef(func() { + f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { + return true + }) + }) } // Size returns the number of file descriptor slots currently allocated. diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index 7fd97dc53..6b8feb107 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -31,6 +31,8 @@ type descriptorTable struct { } // init initializes the table. +// +// TODO(gvisor.dev/1486): Enable leak check for FDTable. func (f *FDTable) init() { var slice []unsafe.Pointer // Empty slice. atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 8f2d36d5a..d46d1e1c1 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -18,7 +18,6 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -30,7 +29,7 @@ import ( // // +stateify savable type FSContext struct { - refs.AtomicRefCount + FSContextRefs // mu protects below. mu sync.Mutex `state:"nosave"` @@ -64,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { cwd: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } @@ -77,54 +76,56 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { cwdVFS2: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } -// destroy is the destructor for an FSContext. +// DecRef implements RefCounter.DecRef. // -// This will call DecRef on both root and cwd Dirents. If either call to -// DecRef returns an error, then it will be propagated. If both calls to -// DecRef return an error, then the one from root.DecRef will be propagated. +// When f reaches zero references, DecRef will be called on both root and cwd +// Dirents. // // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. -func (f *FSContext) destroy(ctx context.Context) { - // Hold f.mu so that we don't race with RootDirectory() and - // WorkingDirectory(). - f.mu.Lock() - defer f.mu.Unlock() - - if VFS2Enabled { - f.rootVFS2.DecRef(ctx) - f.rootVFS2 = vfs.VirtualDentry{} - f.cwdVFS2.DecRef(ctx) - f.cwdVFS2 = vfs.VirtualDentry{} - } else { - f.root.DecRef(ctx) - f.root = nil - f.cwd.DecRef(ctx) - f.cwd = nil - } -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. func (f *FSContext) DecRef(ctx context.Context) { - f.DecRefWithDestructor(ctx, f.destroy) + f.FSContextRefs.DecRef(func() { + // Hold f.mu so that we don't race with RootDirectory() and + // WorkingDirectory(). + f.mu.Lock() + defer f.mu.Unlock() + + if VFS2Enabled { + f.rootVFS2.DecRef(ctx) + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef(ctx) + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef(ctx) + f.root = nil + f.cwd.DecRef(ctx) + f.cwd = nil + } + }) } // Fork forks this FSContext. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { + if !f.cwdVFS2.Ok() { + panic("FSContext.Fork() called after destroy") + } f.cwdVFS2.IncRef() f.rootVFS2.IncRef() } else { + if f.cwd == nil { + panic("FSContext.Fork() called after destroy") + } f.cwd.IncRef() f.root.IncRef() } @@ -140,8 +141,8 @@ func (f *FSContext) Fork() *FSContext { // WorkingDirectory returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -152,8 +153,8 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent { // WorkingDirectoryVFS2 returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() @@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") @@ -187,11 +188,15 @@ func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { // SetWorkingDirectoryVFS2 sets the current working directory. // This will take an extra reference on the VirtualDentry. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() + if !f.cwdVFS2.Ok() { + panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v)) called after destroy", d)) + } + old := f.cwdVFS2 f.cwdVFS2 = d d.IncRef() @@ -200,8 +205,8 @@ func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDe // RootDirectory returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -213,8 +218,8 @@ func (f *FSContext) RootDirectory() *fs.Dirent { // RootDirectoryVFS2 returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() @@ -226,7 +231,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after free. +// This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") @@ -247,7 +252,7 @@ func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. // -// This is not a valid call after free. +// This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 5c4c622c2..df5c8421b 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,8 +16,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,7 +30,7 @@ type ProcessGroupID ThreadID // // +stateify savable type Session struct { - refs refs.AtomicRefCount + SessionRefs // leader is the originator of the Session. // @@ -62,16 +60,11 @@ type Session struct { sessionEntry } -// incRef grabs a reference. -func (s *Session) incRef() { - s.refs.IncRef() -} - -// decRef drops a reference. +// DecRef drops a reference. // // Precondition: callers must hold TaskSet.mu for writing. -func (s *Session) decRef() { - s.refs.DecRefWithDestructor(nil, func(context.Context) { +func (s *Session) DecRef() { + s.SessionRefs.DecRef(func() { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] @@ -88,7 +81,7 @@ func (s *Session) decRef() { // // +stateify savable type ProcessGroup struct { - refs refs.AtomicRefCount // not exported. + refs ProcessGroupRefs // originator is the originator of the group. // @@ -163,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { } alive := true - pg.refs.DecRefWithDestructor(nil, func(context.Context) { + pg.refs.DecRef(func() { alive = false // don't bother with handleOrphan. // Remove translations from the originator. @@ -175,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { // Remove the list of process groups. pg.session.processGroups.Remove(pg) - pg.session.decRef() + pg.session.DecRef() }) if alive { pg.handleOrphan() @@ -302,7 +295,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } - s.refs.EnableLeakCheck("kernel.Session") + s.EnableLeakCheck() // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -316,7 +309,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() // Tie them and return the result. s.processGroups.PushBack(pg) @@ -396,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // // We manually adjust the ancestors if the parent is in the same // session. - tg.processGroup.session.incRef() + tg.processGroup.session.IncRef() pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index c211fc8d0..b7e4b480d 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,12 +1,25 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "shm_refs", + out = "shm_refs.go", + package = "shm", + prefix = "Shm", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Shm", + }, +) + go_library( name = "shm", srcs = [ "device.go", "shm.go", + "shm_refs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 13ec7afe0..00c03585e 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -39,7 +39,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -252,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } - shm.EnableLeakCheck("kernel.Shm") + shm.EnableLeakCheck() // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { @@ -337,14 +336,14 @@ func (r *Registry) remove(s *Shm) { // // +stateify savable type Shm struct { - // AtomicRefCount tracks the number of references to this segment. + // ShmRefs tracks the number of references to this segment. // // A segment holds a reference to itself until it is marked for // destruction. // // In addition to direct users, the MemoryManager will hold references // via MappingIdentity. - refs.AtomicRefCount + ShmRefs mfp pgalloc.MemoryFileProvider @@ -428,11 +427,14 @@ func (s *Shm) InodeID() uint64 { return uint64(s.ID) } -// DecRef overrides refs.RefCount.DecRef with a destructor. +// DecRef drops a reference on s. // // Precondition: Caller must not hold s.mu. func (s *Shm) DecRef(ctx context.Context) { - s.DecRefWithDestructor(ctx, s.destroy) + s.ShmRefs.DecRef(func() { + s.mfp.MemoryFile().DecRef(s.fr) + s.registry.remove(s) + }) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm @@ -642,11 +644,6 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { return nil } -func (s *Shm) destroy(context.Context) { - s.mfp.MemoryFile().DecRef(s.fr) - s.registry.remove(s) -} - // MarkDestroyed marks a segment for destruction. The segment is actually // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index f9d0837a1..b4a47ccca 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -73,12 +73,35 @@ go_template_instance( }, ) +go_template_instance( + name = "aio_mappable_refs", + out = "aio_mappable_refs.go", + package = "mm", + prefix = "aioMappable", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "aioMappable", + }, +) + +go_template_instance( + name = "special_mappable_refs", + out = "special_mappable_refs.go", + package = "mm", + prefix = "SpecialMappable", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "SpecialMappable", + }, +) + go_library( name = "mm", srcs = [ "address_space.go", "aio_context.go", "aio_context_state.go", + "aio_mappable_refs.go", "debug.go", "file_refcount_set.go", "io.go", @@ -92,6 +115,7 @@ go_library( "save_restore.go", "shm.go", "special_mappable.go", + "special_mappable_refs.go", "syscalls.go", "vma.go", "vma_set.go", diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 16fea53c4..7bf48cb2c 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -17,7 +17,6 @@ package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -239,7 +238,7 @@ func (ctx *AIOContext) Drain() { // // +stateify savable type aioMappable struct { - refs.AtomicRefCount + aioMappableRefs mfp pgalloc.MemoryFileProvider fr memmap.FileRange @@ -253,13 +252,13 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { return nil, err } m := aioMappable{mfp: mfp, fr: fr} - m.EnableLeakCheck("mm.aioMappable") + m.EnableLeakCheck() return &m, nil } // DecRef implements refs.RefCounter.DecRef. func (m *aioMappable) DecRef(ctx context.Context) { - m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + m.aioMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 4cdb52eb6..f4c93baeb 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -16,7 +16,6 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -31,7 +30,7 @@ import ( // // +stateify savable type SpecialMappable struct { - refs.AtomicRefCount + SpecialMappableRefs mfp pgalloc.MemoryFileProvider fr memmap.FileRange @@ -45,13 +44,13 @@ type SpecialMappable struct { // Preconditions: fr.Length() != 0. func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable { m := SpecialMappable{mfp: mfp, fr: fr, name: name} - m.EnableLeakCheck("mm.SpecialMappable") + m.EnableLeakCheck() return &m } // DecRef implements refs.RefCounter.DecRef. func (m *SpecialMappable) DecRef(ctx context.Context) { - m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + m.SpecialMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index c708b6030..26c3a51b9 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "queue_refs", + out = "queue_refs.go", + package = "transport", + prefix = "queue", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "queue", + }, +) + go_library( name = "transport", srcs = [ @@ -22,6 +33,7 @@ go_library( "connectioned_state.go", "connectionless.go", "queue.go", + "queue_refs.go", "transport_message_list.go", "unix.go", ], diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index c67b602f0..e3a75b519 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -142,9 +142,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E } q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} - q1.EnableLeakCheck("transport.queue") + q1.EnableLeakCheck() q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} - q2.EnableLeakCheck("transport.queue") + q2.EnableLeakCheck() if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} @@ -300,14 +300,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn } readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} - readQueue.EnableLeakCheck("transport.queue") + readQueue.EnableLeakCheck() ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} - writeQueue.EnableLeakCheck("transport.queue") + writeQueue.EnableLeakCheck() if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 70ee8f9b8..4751b2fd8 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -42,7 +42,7 @@ var ( func NewConnectionless(ctx context.Context) Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} - q.EnableLeakCheck("transport.queue") + q.EnableLeakCheck() ep.receiver = &queueReceiver{readQueue: &q} return ep } diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index ef6043e19..342def28f 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -16,7 +16,6 @@ package transport import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" @@ -28,7 +27,7 @@ import ( // // +stateify savable type queue struct { - refs.AtomicRefCount + queueRefs ReaderQueue *waiter.Queue WriterQueue *waiter.Queue @@ -68,11 +67,13 @@ func (q *queue) Reset(ctx context.Context) { q.mu.Unlock() } -// DecRef implements RefCounter.DecRef with destructor q.Reset. +// DecRef implements RefCounter.DecRef. func (q *queue) DecRef(ctx context.Context) { - q.DecRefWithDestructor(ctx, q.Reset) - // We don't need to notify after resetting because no one cares about - // this queue after all references have been dropped. + q.queueRefs.DecRef(func() { + // We don't need to notify after resetting because no one cares about + // this queue after all references have been dropped. + q.Reset(ctx) + }) } // IsReadable determines if q is currently readable. diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 642769e7c..8093ca55c 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -27,6 +27,39 @@ go_template_instance( }, ) +go_template_instance( + name = "file_description_refs", + out = "file_description_refs.go", + package = "vfs", + prefix = "FileDescription", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "FileDescription", + }, +) + +go_template_instance( + name = "mount_namespace_refs", + out = "mount_namespace_refs.go", + package = "vfs", + prefix = "MountNamespace", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "MountNamespace", + }, +) + +go_template_instance( + name = "filesystem_refs", + out = "filesystem_refs.go", + package = "vfs", + prefix = "Filesystem", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Filesystem", + }, +) + go_library( name = "vfs", srcs = [ @@ -40,12 +73,15 @@ go_library( "event_list.go", "file_description.go", "file_description_impl_util.go", + "file_description_refs.go", "filesystem.go", "filesystem_impl_util.go", + "filesystem_refs.go", "filesystem_type.go", "inotify.go", "lock.go", "mount.go", + "mount_namespace_refs.go", "mount_unsafe.go", "options.go", "pathname.go", @@ -63,6 +99,7 @@ go_library( "//pkg/fspath", "//pkg/gohacks", "//pkg/log", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 4b9faf2ea..5aad31b78 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -184,12 +184,3 @@ This construction, which is essentially a type-safe analogue to Linux's - File locking - `O_ASYNC` - -- Reference counts in the `vfs` package do not use the `refs` package since - `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference - count, resulting in considerable cache bloat. 24 bytes of this overhead is - for weak reference support, which have poor performance and will not be used - by VFS2. The remaining 40 bytes is to store a descriptive string and stack - trace for reference leak checking; we can support reference leak checking - without incurring this space overhead by including the applicable - information directly in finalizers for applicable types. diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 3219a9e13..22a54fa48 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -38,9 +38,7 @@ import ( // // FileDescription is analogous to Linux's struct file. type FileDescription struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FileDescriptionRefs // flagsMu protects statusFlags and asyncHandler below. flagsMu sync.Mutex @@ -131,7 +129,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou } } - fd.refs = 1 + fd.EnableLeakCheck() // Remove "file creation flags" to mirror the behavior from file.f_flags in // fs/open.c:do_dentry_open. @@ -149,30 +147,9 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou return nil } -// IncRef increments fd's reference count. -func (fd *FileDescription) IncRef() { - atomic.AddInt64(&fd.refs, 1) -} - -// TryIncRef increments fd's reference count and returns true. If fd's -// reference count is already zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fd. -func (fd *FileDescription) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fd.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef(ctx context.Context) { - if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { + fd.FileDescriptionRefs.DecRef(func() { // Unregister fd from all epoll instances. fd.epollMu.Lock() epolls := fd.epolls @@ -208,15 +185,7 @@ func (fd *FileDescription) DecRef(ctx context.Context) { } fd.asyncHandler = nil fd.flagsMu.Unlock() - } else if refs < 0 { - panic("FileDescription.DecRef() called without holding a reference") - } -} - -// Refs returns the current number of references. The returned count -// is inherently racy and is unsafe to use without external synchronization. -func (fd *FileDescription) Refs() int64 { - return atomic.LoadInt64(&fd.refs) + }) } // Mount returns the mount on which fd was opened. It does not take a reference diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 2c60cfab2..46851f638 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -15,8 +15,6 @@ package vfs import ( - "sync/atomic" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" @@ -34,9 +32,7 @@ import ( // // +stateify savable type Filesystem struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FilesystemRefs // vfs is the VirtualFilesystem that uses this Filesystem. vfs is // immutable. @@ -52,7 +48,7 @@ type Filesystem struct { // Init must be called before first use of fs. func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) { - fs.refs = 1 + fs.EnableLeakCheck() fs.vfs = vfsObj fs.fsType = fsType fs.impl = impl @@ -76,39 +72,14 @@ func (fs *Filesystem) Impl() FilesystemImpl { return fs.impl } -// IncRef increments fs' reference count. -func (fs *Filesystem) IncRef() { - if atomic.AddInt64(&fs.refs, 1) <= 1 { - panic("Filesystem.IncRef() called without holding a reference") - } -} - -// TryIncRef increments fs' reference count and returns true. If fs' reference -// count is zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fs. -func (fs *Filesystem) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fs.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fs' reference count. func (fs *Filesystem) DecRef(ctx context.Context) { - if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.FilesystemRefs.DecRef(func() { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() fs.impl.Release(ctx) - } else if refs < 0 { - panic("Filesystem.decRef() called without holding a reference") - } + }) } // FilesystemImpl contains implementation details for a Filesystem. diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index cd5456eef..db5fb3bb1 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -128,16 +128,14 @@ func (mnt *Mount) Options() MountOptions { // // +stateify savable type MountNamespace struct { + MountNamespaceRefs + // Owner is the usernamespace that owns this mount namespace. Owner *auth.UserNamespace // root is the MountNamespace's root mount. root is immutable. root *Mount - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 - // mountpoints maps all Dentries which are mount points in this namespace // to the number of Mounts for which they are mount points. mountpoints is // protected by VirtualFilesystem.mountMu. @@ -168,9 +166,9 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth } mntns := &MountNamespace{ Owner: creds.UserNamespace, - refs: 1, mountpoints: make(map[*Dentry]uint32), } + mntns.EnableLeakCheck() mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{}) return mntns, nil } @@ -509,17 +507,10 @@ func (mnt *Mount) DecRef(ctx context.Context) { } } -// IncRef increments mntns' reference count. -func (mntns *MountNamespace) IncRef() { - if atomic.AddInt64(&mntns.refs, 1) <= 1 { - panic("MountNamespace.IncRef() called without holding a reference") - } -} - // DecRef decrements mntns' reference count. func (mntns *MountNamespace) DecRef(ctx context.Context) { vfs := mntns.root.fs.VirtualFilesystem() - if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { + mntns.MountNamespaceRefs.DecRef(func() { vfs.mountMu.Lock() vfs.mounts.seq.BeginWrite() vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{ @@ -533,9 +524,7 @@ func (mntns *MountNamespace) DecRef(ctx context.Context) { for _, mnt := range mountsToDecRef { mnt.DecRef(ctx) } - } else if refs < 0 { - panic("MountNamespace.DecRef() called without holding a reference") - } + }) } // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index 6c137f693..0243424f6 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -1,18 +1,32 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "tun_endpoint_refs", + out = "tun_endpoint_refs.go", + package = "tun", + prefix = "tunEndpoint", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "tunEndpoint", + }, +) + go_library( name = "tun", srcs = [ "device.go", "protocol.go", + "tun_endpoint_refs.go", "tun_unsafe.go", ], visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/log", "//pkg/refs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 3b1510a33..b6ddbe81e 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -135,6 +134,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE // 2. Creating a new NIC. id := tcpip.NICID(s.UniqueID()) + // TODO(gvisor.dev/1486): enable leak check for tunEndpoint. endpoint := &tunEndpoint{ Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""), stack: s, @@ -331,19 +331,18 @@ func (d *Device) WriteNotify() { // It is ref-counted as multiple opening files can attach to the same NIC. // The last owner is responsible for deleting the NIC. type tunEndpoint struct { + tunEndpointRefs *channel.Endpoint - refs.AtomicRefCount - stack *stack.Stack nicID tcpip.NICID name string isTap bool } -// DecRef decrements refcount of e, removes NIC if refcount goes to 0. +// DecRef decrements refcount of e, removing NIC if it reaches 0. func (e *tunEndpoint) DecRef(ctx context.Context) { - e.DecRefWithDestructor(ctx, func(context.Context) { + e.tunEndpointRefs.DecRef(func() { e.stack.RemoveNIC(e.nicID) }) } -- cgit v1.2.3 From 8d75fc4883ca8c10fb615203993d56d33a9e36b6 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Fri, 28 Aug 2020 14:29:16 -0700 Subject: Implement StatFS for various VFS2 filesystems. This mainly involved enabling kernfs' client filesystems to provide a StatFS implementation. Fixes #3411, #3515. PiperOrigin-RevId: 329009864 --- pkg/abi/linux/fs.go | 1 + pkg/sentry/fsimpl/devpts/devpts.go | 10 +++++++++- pkg/sentry/fsimpl/devpts/master.go | 1 + pkg/sentry/fsimpl/devpts/slave.go | 1 + pkg/sentry/fsimpl/fuse/fusefs.go | 8 +++++++- pkg/sentry/fsimpl/host/host.go | 1 + pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 1 + pkg/sentry/fsimpl/kernfs/filesystem.go | 5 ++--- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 16 +++++++++++++--- pkg/sentry/fsimpl/kernfs/kernfs.go | 5 +++++ pkg/sentry/fsimpl/kernfs/kernfs_test.go | 8 +++++--- pkg/sentry/fsimpl/kernfs/symlink.go | 1 + pkg/sentry/fsimpl/pipefs/pipefs.go | 8 +++++--- pkg/sentry/fsimpl/proc/filesystem.go | 7 +++++++ pkg/sentry/fsimpl/proc/subtasks.go | 9 +++++---- pkg/sentry/fsimpl/proc/task.go | 7 ++++--- pkg/sentry/fsimpl/proc/task_fds.go | 19 +++++++++++-------- pkg/sentry/fsimpl/proc/task_files.go | 2 ++ pkg/sentry/fsimpl/proc/tasks.go | 9 +++++---- pkg/sentry/fsimpl/proc/tasks_files.go | 2 ++ pkg/sentry/fsimpl/sockfs/sockfs.go | 9 +++++++-- pkg/sentry/fsimpl/sys/kcov.go | 3 ++- pkg/sentry/fsimpl/sys/sys.go | 14 ++++++++++++++ pkg/sentry/vfs/filesystem_impl_util.go | 13 +++++++++++++ test/syscalls/BUILD | 1 + test/syscalls/linux/pipe.cc | 12 ++++++++++++ test/syscalls/linux/proc.cc | 14 ++++++++++++++ test/syscalls/linux/socket.cc | 17 +++++++++++++++++ test/syscalls/linux/statfs.cc | 16 ++++++++++------ 29 files changed, 178 insertions(+), 42 deletions(-) (limited to 'pkg/sentry/fsimpl/proc') diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 2b1ef0d4e..0d921ed6f 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -29,6 +29,7 @@ const ( SYSFS_MAGIC = 0x62656572 TMPFS_MAGIC = 0x01021994 V9FS_MAGIC = 0x01021997 + FUSE_SUPER_MAGIC = 0x65735546 ) // Filesystem path limits, from uapi/linux/limits.h. diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 0eaff9087..57580f4d4 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -111,12 +111,13 @@ func (fs *filesystem) Release(ctx context.Context) { // rootInode is the root directory inode for the devpts mounts. type rootInode struct { - rootInodeRefs + implStatFS kernfs.AlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink kernfs.OrderedChildren + rootInodeRefs locks vfs.FileLocks @@ -240,3 +241,10 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, func (i *rootInode) DecRef(context.Context) { i.rootInodeRefs.DecRef(i.Destroy) } + +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil +} diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 3bb397f71..60feb1993 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -30,6 +30,7 @@ import ( // masterInode is the inode for the master end of the Terminal. type masterInode struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotDirectory diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index 32e4e1908..a9da7af64 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -29,6 +29,7 @@ import ( // slaveInode is the inode for the slave end of the Terminal. type slaveInode struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotDirectory diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 9717c0e15..810819ae4 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -200,9 +200,9 @@ func (fs *filesystem) Release(ctx context.Context) { type inode struct { inodeRefs kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren locks vfs.FileLocks @@ -331,3 +331,9 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio func (i *inode) DecRef(context.Context) { i.inodeRefs.DecRef(i.Destroy) } + +// StatFS implements kernfs.Inode.StatFS. +func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + // TODO(gvisor.dev/issues/3413): Complete the implementation of statfs. + return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil +} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 2d3821f33..7561f821c 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -186,6 +186,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe // inode implements kernfs.Inode. type inode struct { + kernfs.InodeNoStatFS kernfs.InodeNotDirectory kernfs.InodeNotSymlink diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 12adf727a..1ee089620 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -35,6 +35,7 @@ import ( // +stateify savable type DynamicBytesFile struct { InodeAttrs + InodeNoStatFS InodeNoopRefCount InodeNotDirectory InodeNotSymlink diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index e5d6b5c35..0e3011689 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -721,14 +721,13 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) + _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issue/1193): actually implement statfs. - return linux.Statfs{}, syserror.ENOSYS + return inode.StatFS(ctx, fs.VFSFilesystem()) } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index f442a5606..c0b863ba4 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -546,12 +546,13 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D // // +stateify savable type StaticDirectory struct { - StaticDirectoryRefs - InodeNotSymlink - InodeDirectoryNoNewChildren InodeAttrs + InodeDirectoryNoNewChildren InodeNoDynamicLookup + InodeNoStatFS + InodeNotSymlink OrderedChildren + StaticDirectoryRefs locks vfs.FileLocks fdOpts GenericDirectoryFDOptions @@ -609,3 +610,12 @@ type AlwaysValid struct{} func (*AlwaysValid) Valid(context.Context) bool { return true } + +// InodeNoStatFS partially implements the Inode interface, where the client +// filesystem doesn't support statfs(2). +type InodeNoStatFS struct{} + +// StatFS implements Inode.StatFS. +func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return linux.Statfs{}, syserror.ENOSYS +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index ca3685800..88fcd54aa 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -320,6 +320,11 @@ type Inode interface { // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing // the inode on which Open() is being called. Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + + // StatFS returns filesystem statistics for the client filesystem. This + // corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem + // doesn't support statfs(2), this should return ENOSYS. + StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) } type inodeRefs interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index e376d1736..675587c6b 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -98,9 +98,10 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S type readonlyDir struct { readonlyDirRefs attrs - kernfs.InodeNotSymlink - kernfs.InodeNoDynamicLookup kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNoDynamicLookup + kernfs.InodeNoStatFS + kernfs.InodeNotSymlink kernfs.OrderedChildren locks vfs.FileLocks @@ -137,9 +138,10 @@ func (d *readonlyDir) DecRef(context.Context) { type dir struct { dirRefs attrs - kernfs.InodeNotSymlink kernfs.InodeNoDynamicLookup + kernfs.InodeNotSymlink kernfs.OrderedChildren + kernfs.InodeNoStatFS locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 2ab3f53fd..64731a3e4 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -28,6 +28,7 @@ type StaticSymlink struct { InodeAttrs InodeNoopRefCount InodeSymlink + InodeNoStatFS target string } diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 2ca793db9..7053ad6db 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -143,14 +143,16 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. return syserror.EPERM } -// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement -// statfs, from which we should indicate PIPEFS_MAGIC. - // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks) } +// StatFS implements kernfs.Inode.StatFS. +func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil +} + // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read // and write ends of a newly-created pipe, as for pipe(2) and pipe2(2). // diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index c350ec127..03b5941b9 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -121,3 +121,10 @@ func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64 type InternalData struct { Cgroups map[string]string } + +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil +} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 01c0efb3a..d57d94dbc 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -31,12 +31,13 @@ import ( // // +stateify savable type subtasksInode struct { - subtasksInodeRefs - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + implStatFS + kernfs.AlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink kernfs.OrderedChildren - kernfs.AlwaysValid + subtasksInodeRefs locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 66b557abd..dbdb5d929 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -32,12 +32,13 @@ import ( // // +stateify savable type taskInode struct { - taskInodeRefs - kernfs.InodeNotSymlink + implStatFS + kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNoDynamicLookup - kernfs.InodeAttrs + kernfs.InodeNotSymlink kernfs.OrderedChildren + taskInodeRefs locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 0527b2de8..3f0d78461 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -100,13 +100,14 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off // // +stateify savable type fdDirInode struct { + fdDir fdDirInodeRefs - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + implStatFS + kernfs.AlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink kernfs.OrderedChildren - kernfs.AlwaysValid - fdDir } var _ kernfs.Inode = (*fdDirInode)(nil) @@ -185,6 +186,7 @@ func (i *fdDirInode) DecRef(context.Context) { // // +stateify savable type fdSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -233,13 +235,14 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen // // +stateify savable type fdInfoDirInode struct { + fdDir fdInfoDirInodeRefs - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + implStatFS + kernfs.AlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink kernfs.OrderedChildren - kernfs.AlwaysValid - fdDir } var _ kernfs.Inode = (*fdInfoDirInode)(nil) diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 830b78949..356036b9b 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -648,6 +648,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset // // +stateify savable type exeSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -832,6 +833,7 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir // namespaceInode is a synthetic inode created to represent a namespace in // /proc/[pid]/ns/*. type namespaceInode struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotDirectory diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 863c4467e..3ea00ab87 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -37,12 +37,13 @@ const ( // // +stateify savable type tasksInode struct { - tasksInodeRefs - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + implStatFS + kernfs.AlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink kernfs.OrderedChildren - kernfs.AlwaysValid + tasksInodeRefs locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 7d8983aa5..8c41729e4 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -32,6 +32,7 @@ import ( ) type selfSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -74,6 +75,7 @@ func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, } type threadSelfSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index c61818ff6..94a998568 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -81,10 +81,10 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe // inode implements kernfs.Inode. type inode struct { - kernfs.InodeNotDirectory - kernfs.InodeNotSymlink kernfs.InodeAttrs kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink } // Open implements kernfs.Inode.Open. @@ -92,6 +92,11 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr return nil, syserror.ENXIO } +// StatFS implements kernfs.Inode.StatFS. +func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil +} + // NewDentry constructs and returns a sockfs dentry. // // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go index 92710d877..73f3d3309 100644 --- a/pkg/sentry/fsimpl/sys/kcov.go +++ b/pkg/sentry/fsimpl/sys/kcov.go @@ -39,8 +39,9 @@ func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) type kcovInode struct { kernfs.InodeAttrs kernfs.InodeNoopRefCount - kernfs.InodeNotSymlink kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + implStatFS } func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index ea30a4ec2..39952d2d0 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -163,9 +163,16 @@ func (d *dir) DecRef(context.Context) { d.dirRefs.DecRef(d.Destroy) } +// StatFS implements kernfs.Inode.StatFS. +func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil +} + // cpuFile implements kernfs.Inode. type cpuFile struct { + implStatFS kernfs.DynamicBytesFile + maxCores uint } @@ -182,3 +189,10 @@ func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode li d.Init(c) return d } + +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil +} diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go index 465e610e0..2620cf975 100644 --- a/pkg/sentry/vfs/filesystem_impl_util.go +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -16,6 +16,9 @@ package vfs import ( "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/usermem" ) // GenericParseMountOptions parses a comma-separated list of options of the @@ -41,3 +44,13 @@ func GenericParseMountOptions(str string) map[string]string { } return m } + +// GenericStatFS returns a statfs struct filled with the common fields for a +// general filesystem. This is analogous to Linux's fs/libfs.cs:simple_statfs(). +func GenericStatFS(fsMagic uint64) linux.Statfs { + return linux.Statfs{ + Type: fsMagic, + BlockSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + } +} diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 65e8299c3..f949bc0e3 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -803,6 +803,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:statfs_test", + use_tmpfs = True, # Test specifically relies on TEST_TMPDIR to be tmpfs. ) syscall_test( diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index 34291850d..c097c9187 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -13,7 +13,9 @@ // limitations under the License. #include /* Obtain O_* constant definitions */ +#include #include +#include #include #include @@ -198,6 +200,16 @@ TEST_P(PipeTest, NonBlocking) { SyscallFailsWithErrno(EWOULDBLOCK)); } +TEST(PipeTest, StatFS) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + struct statfs st; + EXPECT_THAT(fstatfs(fds[0], &st), SyscallSucceeds()); + EXPECT_EQ(st.f_type, PIPEFS_MAGIC); + EXPECT_EQ(st.f_bsize, getpagesize()); + EXPECT_EQ(st.f_namelen, NAME_MAX); +} + TEST(Pipe2Test, CloExec) { int fds[2]; ASSERT_THAT(pipe2(fds, O_CLOEXEC), SyscallSucceeds()); diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index d6b875dbf..b73189e55 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -2159,6 +2161,18 @@ TEST(Proc, PidTidIOAccounting) { noop.Join(); } +TEST(Proc, Statfs) { + struct statfs st; + EXPECT_THAT(statfs("/proc", &st), SyscallSucceeds()); + if (IsRunningWithVFS1()) { + EXPECT_EQ(st.f_type, ANON_INODE_FS_MAGIC); + } else { + EXPECT_EQ(st.f_type, PROC_SUPER_MAGIC); + } + EXPECT_EQ(st.f_bsize, getpagesize()); + EXPECT_EQ(st.f_namelen, NAME_MAX); +} + } // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc index c20cd3fcc..e680d3dd7 100644 --- a/test/syscalls/linux/socket.cc +++ b/test/syscalls/linux/socket.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -26,6 +27,9 @@ namespace gvisor { namespace testing { +// From linux/magic.h, but we can't depend on linux headers here. +#define SOCKFS_MAGIC 0x534F434B + TEST(SocketTest, UnixSocketPairProtocol) { int socks[2]; ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, socks), @@ -94,6 +98,19 @@ TEST(SocketTest, UnixSocketStat) { } } +TEST(SocketTest, UnixSocketStatFS) { + SKIP_IF(IsRunningWithVFS1()); + + FileDescriptor bound = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX)); + + struct statfs st; + EXPECT_THAT(fstatfs(bound.get(), &st), SyscallSucceeds()); + EXPECT_EQ(st.f_type, SOCKFS_MAGIC); + EXPECT_EQ(st.f_bsize, getpagesize()); + EXPECT_EQ(st.f_namelen, NAME_MAX); +} + using SocketOpenTest = ::testing::TestWithParam; // UDS cannot be opened. diff --git a/test/syscalls/linux/statfs.cc b/test/syscalls/linux/statfs.cc index aca51d30f..49f2f156c 100644 --- a/test/syscalls/linux/statfs.cc +++ b/test/syscalls/linux/statfs.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -26,6 +27,10 @@ namespace testing { namespace { +// From linux/magic.h. For some reason, not defined in the headers for some +// build environments. +#define OVERLAYFS_SUPER_MAGIC 0x794c7630 + TEST(StatfsTest, CannotStatBadPath) { auto temp_file = NewTempAbsPathInDir("/tmp"); @@ -38,19 +43,18 @@ TEST(StatfsTest, InternalTmpfs) { struct statfs st; EXPECT_THAT(statfs(temp_file.path().c_str(), &st), SyscallSucceeds()); + // Note: We could be an overlay or goferfs on some configurations. + EXPECT_TRUE(st.f_type == TMPFS_MAGIC || st.f_type == OVERLAYFS_SUPER_MAGIC || + st.f_type == V9FS_MAGIC); } TEST(StatfsTest, InternalDevShm) { struct statfs st; EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds()); -} - -TEST(StatfsTest, NameLen) { - struct statfs st; - EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds()); // This assumes that /dev/shm is tmpfs. - EXPECT_EQ(st.f_namelen, NAME_MAX); + // Note: We could be an overlay on some configurations. + EXPECT_TRUE(st.f_type == TMPFS_MAGIC || st.f_type == OVERLAYFS_SUPER_MAGIC); } TEST(FstatfsTest, CannotStatBadFd) { -- cgit v1.2.3 From f2f92e52f6548b3b29d561d6d334a4f1fdbd8437 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 8 Sep 2020 13:58:50 -0700 Subject: Honor readonly flag for root mount Updates #1487 PiperOrigin-RevId: 330580699 --- pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 2 +- pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 2 +- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 6 +- pkg/sentry/fsimpl/ext/ext_test.go | 6 +- pkg/sentry/fsimpl/fuse/dev_test.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 2 +- pkg/sentry/fsimpl/proc/tasks_test.go | 2 +- pkg/sentry/fsimpl/sys/sys_test.go | 2 +- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 4 +- pkg/sentry/fsimpl/tmpfs/pipe_test.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs_test.go | 2 +- pkg/sentry/vfs/mount.go | 6 +- runsc/boot/fs.go | 8 +- runsc/boot/loader_test.go | 4 +- runsc/boot/vfs.go | 35 ++++- runsc/container/container_test.go | 180 +++++++++++++--------- runsc/container/multi_container_test.go | 21 +-- runsc/container/shared_volume_test.go | 12 +- 18 files changed, 178 insertions(+), 120 deletions(-) (limited to 'pkg/sentry/fsimpl/proc') diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index 52f44f66d..a23094e54 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -80,7 +80,7 @@ type Accessor struct { // NewAccessor returns an Accessor that supports creation of device special // files in the devtmpfs instance registered with name fsTypeName in vfsObj. func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) { - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{}) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 827a608cb..3a38b8bb4 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -48,7 +48,7 @@ func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.Virtu }) // Create a test mount namespace with devtmpfs mounted at "/dev". - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index a2cc9b59f..c349b886e 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -59,7 +59,11 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }) if err != nil { f.Close() return nil, nil, nil, nil, err diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 2dbaee287..0989558cd 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -71,7 +71,11 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }) if err != nil { f.Close() return nil, nil, nil, nil, err diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index 1ffe7ccd2..aedc2fa39 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -342,7 +342,7 @@ func setup(t *testing.T) *testutil.System { AllowUserMount: true, }) - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 675587c6b..09806a3f2 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -52,7 +52,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) + mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("Failed to create testfs root mount: %v", err) } diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index d82b3d2f3..f693f9060 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -104,7 +104,7 @@ func setup(t *testing.T) *testutil.System { AllowUserMount: true, }) - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{}) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 9fd38b295..0a0d914cc 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System { AllowUserMount: true, }) - mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) + mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{}) if err != nil { t.Fatalf("Failed to create new mount namespace: %v", err) } diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index e5a4218e8..5209a17af 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -182,7 +182,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -376,7 +376,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index ec2701d8b..be29a2363 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -158,7 +158,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go index 6f3e3ae6f..99c8e3c0f 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -41,7 +41,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index db5fb3bb1..06ca91989 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -154,13 +154,13 @@ type MountNamespace struct { // NewMountNamespace returns a new mount namespace with a root filesystem // configured by the given arguments. A reference is taken on the returned // MountNamespace. -func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { ctx.Warningf("Unknown filesystem type: %s", fsTypeName) return nil, syserror.ENODEV } - fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { return nil, err } @@ -169,7 +169,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth mountpoints: make(map[*Dentry]uint32), } mntns.EnableLeakCheck() - mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{}) + mntns.root = newMount(vfs, fs, root, mntns, opts) return mntns, nil } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e2c5f5fb1..ddf288456 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -254,7 +254,7 @@ func mustFindFilesystem(name string) fs.Filesystem { // addSubmountOverlay overlays the inode over a ramfs tree containing the given // paths. -func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) { // Construct a ramfs tree of mount points. The contents never // change, so this can be fully caching. There's no real // filesystem backing this tree, so we set the filesystem to @@ -264,7 +264,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string if err != nil { return nil, fmt.Errorf("creating mount tree: %v", err) } - overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf) if err != nil { return nil, fmt.Errorf("adding mount overlay: %v", err) } @@ -741,7 +741,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") - rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf) if err != nil { return nil, fmt.Errorf("adding submount overlay: %v", err) } @@ -851,7 +851,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Confi submounts := subtargets(m.Destination, c.mounts) if len(submounts) > 0 { log.Infof("Adding submount overlay over %q", m.Destination) - inode, err = addSubmountOverlay(ctx, inode, submounts) + inode, err = addSubmountOverlay(ctx, inode, submounts, mf) if err != nil { return fmt.Errorf("adding submount overlay: %v", err) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index dc9861389..bf9ec5d38 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -491,9 +491,9 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { } ctx := l.k.SupervisorContext() - mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs) + mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs) if err != nil { - t.Fatalf("failed to setupVFS2: %v", err) + t.Fatalf("mountAll: %v", err) } root := mns.Root() diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 66b6cf19b..7844ea28c 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -134,7 +134,7 @@ func registerFilesystems(k *kernel.Kernel) error { } func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { - mns, err := mntr.setupVFS2(ctx, conf, procArgs) + mns, err := mntr.mountAll(conf, procArgs) if err != nil { return fmt.Errorf("failed to setupFS: %w", err) } @@ -149,7 +149,7 @@ func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containe return nil } -func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { +func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { log.Infof("Configuring container's file system with VFS2") // Create context with root credentials to mount the filesystem (the current @@ -172,24 +172,44 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, p if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil { return nil, fmt.Errorf("mounting submounts vfs2: %w", err) } + + if c.root.Readonly || conf.Overlay { + // Switch to ReadOnly after all submounts were setup. + root := mns.Root() + defer root.DecRef(rootCtx) + if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil { + return nil, fmt.Errorf(`failed to set mount at "/" readonly: %v`, err) + } + } + return mns, nil } +// createMountNamespaceVFS2 creates the container's root mount and namespace. +// The mount is created ReadWrite to allow mount point for submounts to be +// created. ** The caller is responsible to switch to ReadOnly if needed ** func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { fd := c.fds.remove() - opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */) + data := p9MountData(fd, conf.FileAccess, true /* vfs2 */) if conf.OverlayfsStaleRead { // We can't check for overlayfs here because sandbox is chroot'ed and gofer // can only send mount options for specs.Mounts (specs.Root is missing // Options field). So assume root is always on top of overlayfs. - opts = append(opts, "overlayfs_stale_read") + data = append(data, "overlayfs_stale_read") } log.Infof("Mounting root over 9P, ioFD: %d", fd) - mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{ - Data: strings.Join(opts, ","), - }) + opts := &vfs.MountOptions{ + // Always mount as ReadWrite to allow other mounts on top of it. It'll be + // made ReadOnly in the caller (if needed). + ReadOnly: false, + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(data, ","), + }, + InternalMount: true, + } + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, opts) if err != nil { return nil, fmt.Errorf("setting up mount namespace: %w", err) } @@ -227,6 +247,7 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config. if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err) } + // Restore back to ReadOnly at the end. defer func() { if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err)) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 6082068c7..33ada5bb9 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -41,6 +41,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" @@ -1490,6 +1491,8 @@ func TestMountNewDir(t *testing.T) { Source: srcDir, Type: "bind", }) + // Extra points for creating the mount with a readonly root. + spec.Root.Readonly = true if err := run(spec, conf); err != nil { t.Fatalf("error running sandbox: %v", err) @@ -1499,17 +1502,17 @@ func TestMountNewDir(t *testing.T) { } func TestReadonlyRoot(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") + spec := testutil.NewSpecWithArgs("sleep", "100") spec.Root.Readonly = true + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } defer cleanup() - // Create, start and wait for the container. args := Args{ ID: testutil.RandomContainerID(), Spec: spec, @@ -1524,12 +1527,82 @@ func TestReadonlyRoot(t *testing.T) { t.Fatalf("error starting container: %v", err) } - ws, err := c.Wait() + // Read mounts to check that root is readonly. + out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", "mount | grep ' / '") + if err != nil || ws != 0 { + t.Fatalf("exec failed, ws: %v, err: %v", ws, err) + } + t.Logf("root mount: %q", out) + if !strings.Contains(string(out), "(ro)") { + t.Errorf("root not mounted readonly: %q", out) + } + + // Check that file cannot be created. + ws, err = execute(c, "/bin/touch", "/foo") if err != nil { - t.Fatalf("error waiting on container: %v", err) + t.Fatalf("touch file in ro mount: %v", err) } if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) + t.Fatalf("wrong waitStatus: %v", ws) + } + }) + } +} + +func TestReadonlyMount(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + spec := testutil.NewSpecWithArgs("sleep", "100") + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "bind", + Options: []string{"ro"}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Read mounts to check that volume is readonly. + cmd := fmt.Sprintf("mount | grep ' %s '", dir) + out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", cmd) + if err != nil || ws != 0 { + t.Fatalf("exec failed, ws: %v, err: %v", ws, err) + } + t.Logf("mount: %q", out) + if !strings.Contains(string(out), "(ro)") { + t.Errorf("volume not mounted readonly: %q", out) + } + + // Check that file cannot be created. + ws, err = execute(c, "/bin/touch", path.Join(dir, "file")) + if err != nil { + t.Fatalf("touch file in ro mount: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("wrong WaitStatus: %v", ws) } }) } @@ -1616,54 +1689,6 @@ func TestUIDMap(t *testing.T) { } } -func TestReadonlyMount(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { - t.Run(name, func(t *testing.T) { - dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") - spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: dir, - Source: dir, - Type: "bind", - Options: []string{"ro"}, - }) - spec.Root.Readonly = false - - _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer cleanup() - - // Create, start and wait for the container. - args := Args{ - ID: testutil.RandomContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - - ws, err := c.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) - } - }) - } -} - // TestAbbreviatedIDs checks that runsc supports using abbreviated container // IDs in place of full IDs. func TestAbbreviatedIDs(t *testing.T) { @@ -2116,21 +2141,13 @@ func TestMountPropagation(t *testing.T) { // Check that mount didn't propagate to private mount. privFile := filepath.Join(priv, "mnt", "file") - execArgs := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "!", "-f", privFile}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + if ws, err := execute(cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 { t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err) } // Check that mount propagated to slave mount. slaveFile := filepath.Join(slave, "mnt", "file") - execArgs = &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", slaveFile}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + if ws, err := execute(cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 { t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err) } } @@ -2196,11 +2213,7 @@ func TestMountSymlink(t *testing.T) { // Check that symlink was resolved and mount was created where the symlink // is pointing to. file := path.Join(target, "file") - execArgs := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", file}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + if ws, err := execute(cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 { t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err) } }) @@ -2326,6 +2339,35 @@ func TestTTYField(t *testing.T) { } } +func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) { + args := &control.ExecArgs{ + Filename: name, + Argv: append([]string{name}, arg...), + } + return cont.executeSync(args) +} + +func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, syscall.WaitStatus, error) { + r, w, err := os.Pipe() + if err != nil { + return nil, 0, err + } + defer r.Close() + + args := &control.ExecArgs{ + Filename: name, + Argv: append([]string{name}, arg...), + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, w, w}}, + } + ws, err := cont.executeSync(args) + w.Close() + if err != nil { + return nil, 0, err + } + out, err := ioutil.ReadAll(r) + return out, ws, err +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 5b790c6c8..952215ec1 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -1517,8 +1517,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { } // Check that container isn't running anymore. - args := &control.ExecArgs{Argv: []string{"/bin/true"}} - if _, err := c.executeSync(args); err == nil { + if _, err := execute(c, "/bin/true"); err == nil { t.Fatalf("Container %q was not stopped after gofer death", c.ID) } @@ -1533,8 +1532,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { if err := waitForProcessList(c, pl); err != nil { t.Errorf("Container %q was affected by another container: %v", c.ID, err) } - args := &control.ExecArgs{Argv: []string{"/bin/true"}} - if _, err := c.executeSync(args); err != nil { + if _, err := execute(c, "/bin/true"); err != nil { t.Fatalf("Container %q was affected by another container: %v", c.ID, err) } } @@ -1556,8 +1554,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Check that entire sandbox isn't running anymore. for _, c := range containers { - args := &control.ExecArgs{Argv: []string{"/bin/true"}} - if _, err := c.executeSync(args); err == nil { + if _, err := execute(c, "/bin/true"); err == nil { t.Fatalf("Container %q was not stopped after gofer death", c.ID) } } @@ -1719,12 +1716,11 @@ func TestMultiContainerHomeEnvDir(t *testing.T) { homeDirs[name] = homeFile } - // We will sleep in the root container in order to ensure that - // the root container doesn't terminate before sub containers can be - // created. + // We will sleep in the root container in order to ensure that the root + //container doesn't terminate before sub containers can be created. rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())} subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())} - execCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())} + execCmd := fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name()) // Setup the containers, a root container and sub container. specConfig, ids := createSpecs(rootCmd, subCmd) @@ -1735,9 +1731,8 @@ func TestMultiContainerHomeEnvDir(t *testing.T) { defer cleanup() // Exec into the root container synchronously. - args := &control.ExecArgs{Argv: execCmd} - if _, err := containers[0].executeSync(args); err != nil { - t.Errorf("error executing %+v: %v", args, err) + if _, err := execute(containers[0], "/bin/sh", "-c", execCmd); err != nil { + t.Errorf("error executing %+v: %v", execCmd, err) } // Wait for the subcontainer to finish. diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go index 4ea8fefee..cb5bffb89 100644 --- a/runsc/container/shared_volume_test.go +++ b/runsc/container/shared_volume_test.go @@ -168,11 +168,7 @@ func TestSharedVolume(t *testing.T) { func checkFile(c *Container, filename string, want []byte) error { cpy := filename + ".copy" - argsCp := &control.ExecArgs{ - Filename: "/bin/cp", - Argv: []string{"cp", "-f", filename, cpy}, - } - if _, err := c.executeSync(argsCp); err != nil { + if _, err := execute(c, "/bin/cp", "-f", filename, cpy); err != nil { return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err) } got, err := ioutil.ReadFile(cpy) @@ -235,11 +231,7 @@ func TestSharedVolumeFile(t *testing.T) { } // Append to file inside the container and check that content is not lost. - argsAppend := &control.ExecArgs{ - Filename: "/bin/bash", - Argv: []string{"bash", "-c", "echo -n sandbox- >> " + filename}, - } - if _, err := c.executeSync(argsAppend); err != nil { + if _, err := execute(c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil { t.Fatalf("unexpected error appending file %q: %v", filename, err) } want = []byte("host-sandbox-") -- cgit v1.2.3